RPM build fix (reverted CI changes which will need to be un-reverted or made conditional) and vendor Rust dependencies to make builds much faster in any CI system.

Adam Ierymenko
2022-06-08 07:32:16 -04:00
parent 373ca30269
commit d5ca4e5f52
12611 changed files with 2898014 additions and 284 deletions


@@ -0,0 +1,961 @@
/* Copyright (c) 2019, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include <GFp/aes.h>
#include "../../internal.h"
#if defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif
// This file contains a constant-time implementation of AES, bitsliced with
// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
//
// This implementation is based on the algorithms described in the following
// references:
// - https://bearssl.org/constanttime.html#aes
// - https://eprint.iacr.org/2009/129.pdf
// - https://eprint.iacr.org/2009/191.pdf
// Word operations.
//
// An aes_word_t is the word used for this AES implementation. Throughout this
// file, bits and bytes are ordered little-endian, though "left" and "right"
// shifts match the operations themselves, which makes them reversed in a
// little-endian, left-to-right reading.
//
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
// bits each, each corresponding to a byte in an AES block in column-major
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
// contents of a logical byte will be described later.)
//
// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
// value ranges from 0 to 15 independent of |aes_word_t| and
// |AES_NOHW_BATCH_SIZE|.
//
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
// uses row-major order. Matching the AES order was easier to reason about, and
// we do not have PSHUFB available to arbitrarily permute bytes.
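//
// Concretely, a shift by one logical byte means a shift by
// |AES_NOHW_BATCH_SIZE| bits in the 32-bit and 64-bit implementations, and a
// shift by one whole byte (via |_mm_slli_si128| / |_mm_srli_si128|) in the
// SSE2 implementation, where a logical byte is a full byte.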
#if defined(OPENSSL_SSE2)
typedef __m128i aes_word_t;
// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
// MSVC, so we define a constant.
#define AES_NOHW_WORD_SIZE 16
#define AES_NOHW_BATCH_SIZE 8
#define AES_NOHW_ROW0_MASK \
_mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
#define AES_NOHW_ROW1_MASK \
_mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
#define AES_NOHW_ROW2_MASK \
_mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
#define AES_NOHW_ROW3_MASK \
_mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
#define AES_NOHW_COL01_MASK \
_mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
#define AES_NOHW_COL2_MASK \
_mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
#define AES_NOHW_COL3_MASK \
_mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)
static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
return _mm_and_si128(a, b);
}
static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
return _mm_or_si128(a, b);
}
static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
return _mm_xor_si128(a, b);
}
static inline aes_word_t aes_nohw_not(aes_word_t a) {
return _mm_xor_si128(
a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
}
// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
// must be constants.
#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
_mm_slli_si128((a), (i))
#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
_mm_srli_si128((a), (i))
#else // !OPENSSL_SSE2
#if defined(OPENSSL_64_BIT)
typedef uint64_t aes_word_t;
#define AES_NOHW_WORD_SIZE 8
#define AES_NOHW_BATCH_SIZE 4
#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
#else // !OPENSSL_64_BIT
typedef uint32_t aes_word_t;
#define AES_NOHW_WORD_SIZE 4
#define AES_NOHW_BATCH_SIZE 2
#define AES_NOHW_ROW0_MASK 0x03030303
#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
#define AES_NOHW_ROW2_MASK 0x30303030
#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
#define AES_NOHW_COL01_MASK 0x0000ffff
#define AES_NOHW_COL2_MASK 0x00ff0000
#define AES_NOHW_COL3_MASK 0xff000000
#endif // OPENSSL_64_BIT
static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
return a & b;
}
static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
return a | b;
}
static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
return a ^ b;
}
static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }
static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
return a << (i * AES_NOHW_BATCH_SIZE);
}
static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
return a >> (i * AES_NOHW_BATCH_SIZE);
}
#endif // OPENSSL_SSE2
OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
"batch size does not match word size");
OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
"AES_NOHW_WORD_SIZE is incorrect");
// Block representations.
//
// This implementation uses three representations for AES blocks. First, the
// public API represents blocks as uint8_t[16] in the usual way. Second, most
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
// bars divide logical bytes):
//
// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
// ...
//
// Finally, an individual block may be stored as an intermediate form in an
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
// block, so that block[0]'s ith logical byte contains the least-significant
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
// "compacting" the block. Note this is no-op with 128-bit words because then
// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
// words, one block would be stored in two words:
//
// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
//
// Observe that the distances between corresponding bits in bitsliced and
// compact bit orders match. If we line up corresponding words of each block,
// the bitsliced and compact representations may be converted by transposing bits
// in corresponding logical bytes. Continuing the 64-bit example:
//
// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ...
// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ...
// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ...
//
// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//
// Note also that bitwise operations and (logical) byte permutations on an
// |aes_word_t| work equally for the bitsliced and compact words.
//
// We use the compact form in the |AES_KEY| representation to save work
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
// before or after |aes_nohw_transpose|.
#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))
// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
// specified, it is in bitsliced form.
typedef struct {
aes_word_t w[8];
} AES_NOHW_BATCH;
// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
// suitable for encryption or decryption. It is as large as
// |AES_NOHW_BATCH_SIZE| |AES_KEY|s so it should not be used as a long-term key
// representation.
typedef struct {
// keys is an array of batches, one for each round key. Each batch stores
// |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
AES_NOHW_BATCH keys[AES_MAXNR + 1];
} AES_NOHW_SCHEDULE;
// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
// compact form.
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
const aes_word_t in[AES_NOHW_BLOCK_WORDS],
size_t i) {
// Note the words are interleaved. The order comes from |aes_nohw_transpose|.
// If |i| is zero and this is the 64-bit implementation, in[0] contains bits
// 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
// w[4] so that bits 0 and 4 are in the correct position. (In general, bits
// along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
// will be correctly placed.)
dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
batch->w[i] = in[0];
#elif defined(OPENSSL_64_BIT)
batch->w[i] = in[0];
batch->w[i + 4] = in[1];
#else
batch->w[i] = in[0];
batch->w[i + 2] = in[1];
batch->w[i + 4] = in[2];
batch->w[i + 6] = in[3];
#endif
}
// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
// compact form.
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
aes_word_t out[AES_NOHW_BLOCK_WORDS],
size_t i) {
dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
out[0] = batch->w[i];
#elif defined(OPENSSL_64_BIT)
out[0] = batch->w[i];
out[1] = batch->w[i + 4];
#else
out[0] = batch->w[i];
out[1] = batch->w[i + 2];
out[2] = batch->w[i + 4];
out[3] = batch->w[i + 6];
#endif
}
#if !defined(OPENSSL_SSE2)
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
aes_word_t shift) {
// See
// https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
aes_word_t b = (a ^ (a >> shift)) & mask;
return a ^ b ^ (b << shift);
}
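// For example, aes_nohw_delta_swap(0b0110, /*mask=*/0b0001, /*shift=*/2)
// computes b = (0b0110 ^ (0b0110 >> 2)) & 0b0001 = 0b0001 and returns
// 0b0110 ^ 0b0001 ^ (0b0001 << 2) = 0b0011, i.e. bits 0 and 2 have been
// swapped while the other bits are left untouched.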
// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
// Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
// quartets of those chunks:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
// Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
// Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
// 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
return a;
}
static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
// Reverse the steps of |aes_nohw_compact_word|.
a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
return a;
}
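// |aes_nohw_compact_word| and |aes_nohw_uncompact_word| are inverses of each
// other: each delta swap is an involution, and uncompacting applies the same
// three swaps in the opposite order, so
// aes_nohw_uncompact_word(aes_nohw_compact_word(a)) == a for every |a|.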
#else // !OPENSSL_64_BIT
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
// Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
// Note: 0x00cc = 0b0000_0000_1100_1100
// 0x00cc << 6 = 0b0011_0011_0000_0000
a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
// Now we swap groups of four bits (still numbering by pairs):
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
// 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
// Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
return a;
}
static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
// Reverse the steps of |aes_nohw_compact_word|.
a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
return a;
}
static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
uint8_t a2, uint8_t a3) {
return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
((uint32_t)a3 << 24);
}
static inline uint8_t lo(uint32_t a) {
return (uint8_t)a;
}
#endif // OPENSSL_64_BIT
#endif // !OPENSSL_SSE2
static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
const uint8_t in[16]) {
GFp_memcpy(out, in, 16);
#if defined(OPENSSL_SSE2)
// No conversions needed.
#elif defined(OPENSSL_64_BIT)
uint64_t a0 = aes_nohw_compact_word(out[0]);
uint64_t a1 = aes_nohw_compact_word(out[1]);
out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
#else
uint32_t a0 = aes_nohw_compact_word(out[0]);
uint32_t a1 = aes_nohw_compact_word(out[1]);
uint32_t a2 = aes_nohw_compact_word(out[2]);
uint32_t a3 = aes_nohw_compact_word(out[3]);
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
#endif
}
static inline void aes_nohw_uncompact_block(
uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
#if defined(OPENSSL_SSE2)
GFp_memcpy(out, in, 16); // No conversions needed.
#elif defined(OPENSSL_64_BIT)
uint64_t a0 = in[0];
uint64_t a1 = in[1];
uint64_t b0 =
aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
uint64_t b1 =
aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
GFp_memcpy(out, &b0, 8);
GFp_memcpy(out + 8, &b1, 8);
#else
uint32_t a0 = in[0];
uint32_t a1 = in[1];
uint32_t a2 = in[2];
uint32_t a3 = in[3];
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
uint32_t b2 =
aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
uint32_t b3 =
aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
b0 = aes_nohw_uncompact_word(b0);
b1 = aes_nohw_uncompact_word(b1);
b2 = aes_nohw_uncompact_word(b2);
b3 = aes_nohw_uncompact_word(b3);
GFp_memcpy(out, &b0, 4);
GFp_memcpy(out + 4, &b1, 4);
GFp_memcpy(out + 8, &b2, 4);
GFp_memcpy(out + 12, &b3, 4);
#endif
}
// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
// is repeated to the full width of |aes_word_t|.
#if defined(OPENSSL_SSE2)
// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
// constant shift values.
#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b, \
/* uint32_t */ mask, /* const */ shift) \
do { \
__m128i swap = \
_mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \
_mm_set_epi32((mask), (mask), (mask), (mask))); \
*(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift))); \
*(b) = _mm_xor_si128(*(b), swap); \
\
} while (0)
#else
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
uint32_t mask, aes_word_t shift) {
#if defined(OPENSSL_64_BIT)
aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
#else
aes_word_t mask_w = mask;
#endif
// This is a variation on a delta swap.
aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
*a ^= swap << shift;
*b ^= swap;
}
#endif // OPENSSL_SSE2
// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
// and transposes each square.
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
// Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);
#if AES_NOHW_BATCH_SIZE >= 4
// Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
#endif
#if AES_NOHW_BATCH_SIZE >= 8
// Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
#endif
}
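// Since a transpose is its own inverse, calling |aes_nohw_transpose| twice
// restores the original batch. This is why the same function is used both to
// enter and to leave bitsliced form.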
// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
size_t num_blocks) {
// Don't leave unused blocks uninitialized.
GFp_memset(out, 0, sizeof(AES_NOHW_BATCH));
debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
for (size_t i = 0; i < num_blocks; i++) {
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block, in + 16 * i);
aes_nohw_batch_set(out, block, i);
}
aes_nohw_transpose(out);
}
// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to
// |out|. |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
const AES_NOHW_BATCH *batch) {
AES_NOHW_BATCH copy = *batch;
aes_nohw_transpose(&copy);
debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
for (size_t i = 0; i < num_blocks; i++) {
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_batch_get(&copy, block, i);
aes_nohw_uncompact_block(out + 16 * i, block);
}
}
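// A minimal sketch of how the two helpers above pair up (this is essentially
// what |GFp_aes_nohw_encrypt| below does around |aes_nohw_encrypt_batch|):
//
//   AES_NOHW_BATCH batch;
//   aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
//   ... operate on |batch| in bitsliced form ...
//   aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);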
// AES round steps.
static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
const AES_NOHW_BATCH *key) {
for (size_t i = 0; i < 8; i++) {
batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
}
}
static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
// See https://eprint.iacr.org/2009/191.pdf, Appendix C.
aes_word_t x0 = batch->w[7];
aes_word_t x1 = batch->w[6];
aes_word_t x2 = batch->w[5];
aes_word_t x3 = batch->w[4];
aes_word_t x4 = batch->w[3];
aes_word_t x5 = batch->w[2];
aes_word_t x6 = batch->w[1];
aes_word_t x7 = batch->w[0];
// Figure 2, the top linear transformation.
aes_word_t y14 = aes_nohw_xor(x3, x5);
aes_word_t y13 = aes_nohw_xor(x0, x6);
aes_word_t y9 = aes_nohw_xor(x0, x3);
aes_word_t y8 = aes_nohw_xor(x0, x5);
aes_word_t t0 = aes_nohw_xor(x1, x2);
aes_word_t y1 = aes_nohw_xor(t0, x7);
aes_word_t y4 = aes_nohw_xor(y1, x3);
aes_word_t y12 = aes_nohw_xor(y13, y14);
aes_word_t y2 = aes_nohw_xor(y1, x0);
aes_word_t y5 = aes_nohw_xor(y1, x6);
aes_word_t y3 = aes_nohw_xor(y5, y8);
aes_word_t t1 = aes_nohw_xor(x4, y12);
aes_word_t y15 = aes_nohw_xor(t1, x5);
aes_word_t y20 = aes_nohw_xor(t1, x1);
aes_word_t y6 = aes_nohw_xor(y15, x7);
aes_word_t y10 = aes_nohw_xor(y15, t0);
aes_word_t y11 = aes_nohw_xor(y20, y9);
aes_word_t y7 = aes_nohw_xor(x7, y11);
aes_word_t y17 = aes_nohw_xor(y10, y11);
aes_word_t y19 = aes_nohw_xor(y10, y8);
aes_word_t y16 = aes_nohw_xor(t0, y11);
aes_word_t y21 = aes_nohw_xor(y13, y16);
aes_word_t y18 = aes_nohw_xor(x0, y16);
// Figure 3, the middle non-linear section.
aes_word_t t2 = aes_nohw_and(y12, y15);
aes_word_t t3 = aes_nohw_and(y3, y6);
aes_word_t t4 = aes_nohw_xor(t3, t2);
aes_word_t t5 = aes_nohw_and(y4, x7);
aes_word_t t6 = aes_nohw_xor(t5, t2);
aes_word_t t7 = aes_nohw_and(y13, y16);
aes_word_t t8 = aes_nohw_and(y5, y1);
aes_word_t t9 = aes_nohw_xor(t8, t7);
aes_word_t t10 = aes_nohw_and(y2, y7);
aes_word_t t11 = aes_nohw_xor(t10, t7);
aes_word_t t12 = aes_nohw_and(y9, y11);
aes_word_t t13 = aes_nohw_and(y14, y17);
aes_word_t t14 = aes_nohw_xor(t13, t12);
aes_word_t t15 = aes_nohw_and(y8, y10);
aes_word_t t16 = aes_nohw_xor(t15, t12);
aes_word_t t17 = aes_nohw_xor(t4, t14);
aes_word_t t18 = aes_nohw_xor(t6, t16);
aes_word_t t19 = aes_nohw_xor(t9, t14);
aes_word_t t20 = aes_nohw_xor(t11, t16);
aes_word_t t21 = aes_nohw_xor(t17, y20);
aes_word_t t22 = aes_nohw_xor(t18, y19);
aes_word_t t23 = aes_nohw_xor(t19, y21);
aes_word_t t24 = aes_nohw_xor(t20, y18);
aes_word_t t25 = aes_nohw_xor(t21, t22);
aes_word_t t26 = aes_nohw_and(t21, t23);
aes_word_t t27 = aes_nohw_xor(t24, t26);
aes_word_t t28 = aes_nohw_and(t25, t27);
aes_word_t t29 = aes_nohw_xor(t28, t22);
aes_word_t t30 = aes_nohw_xor(t23, t24);
aes_word_t t31 = aes_nohw_xor(t22, t26);
aes_word_t t32 = aes_nohw_and(t31, t30);
aes_word_t t33 = aes_nohw_xor(t32, t24);
aes_word_t t34 = aes_nohw_xor(t23, t33);
aes_word_t t35 = aes_nohw_xor(t27, t33);
aes_word_t t36 = aes_nohw_and(t24, t35);
aes_word_t t37 = aes_nohw_xor(t36, t34);
aes_word_t t38 = aes_nohw_xor(t27, t36);
aes_word_t t39 = aes_nohw_and(t29, t38);
aes_word_t t40 = aes_nohw_xor(t25, t39);
aes_word_t t41 = aes_nohw_xor(t40, t37);
aes_word_t t42 = aes_nohw_xor(t29, t33);
aes_word_t t43 = aes_nohw_xor(t29, t40);
aes_word_t t44 = aes_nohw_xor(t33, t37);
aes_word_t t45 = aes_nohw_xor(t42, t41);
aes_word_t z0 = aes_nohw_and(t44, y15);
aes_word_t z1 = aes_nohw_and(t37, y6);
aes_word_t z2 = aes_nohw_and(t33, x7);
aes_word_t z3 = aes_nohw_and(t43, y16);
aes_word_t z4 = aes_nohw_and(t40, y1);
aes_word_t z5 = aes_nohw_and(t29, y7);
aes_word_t z6 = aes_nohw_and(t42, y11);
aes_word_t z7 = aes_nohw_and(t45, y17);
aes_word_t z8 = aes_nohw_and(t41, y10);
aes_word_t z9 = aes_nohw_and(t44, y12);
aes_word_t z10 = aes_nohw_and(t37, y3);
aes_word_t z11 = aes_nohw_and(t33, y4);
aes_word_t z12 = aes_nohw_and(t43, y13);
aes_word_t z13 = aes_nohw_and(t40, y5);
aes_word_t z14 = aes_nohw_and(t29, y2);
aes_word_t z15 = aes_nohw_and(t42, y9);
aes_word_t z16 = aes_nohw_and(t45, y14);
aes_word_t z17 = aes_nohw_and(t41, y8);
// Figure 4, bottom linear transformation.
aes_word_t t46 = aes_nohw_xor(z15, z16);
aes_word_t t47 = aes_nohw_xor(z10, z11);
aes_word_t t48 = aes_nohw_xor(z5, z13);
aes_word_t t49 = aes_nohw_xor(z9, z10);
aes_word_t t50 = aes_nohw_xor(z2, z12);
aes_word_t t51 = aes_nohw_xor(z2, z5);
aes_word_t t52 = aes_nohw_xor(z7, z8);
aes_word_t t53 = aes_nohw_xor(z0, z3);
aes_word_t t54 = aes_nohw_xor(z6, z7);
aes_word_t t55 = aes_nohw_xor(z16, z17);
aes_word_t t56 = aes_nohw_xor(z12, t48);
aes_word_t t57 = aes_nohw_xor(t50, t53);
aes_word_t t58 = aes_nohw_xor(z4, t46);
aes_word_t t59 = aes_nohw_xor(z3, t54);
aes_word_t t60 = aes_nohw_xor(t46, t57);
aes_word_t t61 = aes_nohw_xor(z14, t57);
aes_word_t t62 = aes_nohw_xor(t52, t58);
aes_word_t t63 = aes_nohw_xor(t49, t58);
aes_word_t t64 = aes_nohw_xor(z4, t59);
aes_word_t t65 = aes_nohw_xor(t61, t62);
aes_word_t t66 = aes_nohw_xor(z1, t63);
aes_word_t s0 = aes_nohw_xor(t59, t63);
aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
aes_word_t t67 = aes_nohw_xor(t64, t65);
aes_word_t s3 = aes_nohw_xor(t53, t66);
aes_word_t s4 = aes_nohw_xor(t51, t66);
aes_word_t s5 = aes_nohw_xor(t47, t65);
aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));
batch->w[0] = s7;
batch->w[1] = s6;
batch->w[2] = s5;
batch->w[3] = s4;
batch->w[4] = s3;
batch->w[5] = s2;
batch->w[6] = s1;
batch->w[7] = s0;
}
// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
// constant shift counts in the SSE2 implementation.
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
(aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \
aes_nohw_shift_left((v), 16 - (n)*4)))
static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
for (size_t i = 0; i < 8; i++) {
aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
row1 = aes_nohw_rotate_cols_right(row1, 1);
row2 = aes_nohw_rotate_cols_right(row2, 2);
row3 = aes_nohw_rotate_cols_right(row3, 3);
batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
}
}
// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
// down by one.
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
#if defined(OPENSSL_SSE2)
return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
#elif defined(OPENSSL_64_BIT)
return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
((v << 12) & UINT64_C(0xf000f000f000f000));
#else
return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
#endif
}
// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
// by two.
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
#if defined(OPENSSL_SSE2)
return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
#elif defined(OPENSSL_64_BIT)
return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
((v << 8) & UINT64_C(0xff00ff00ff00ff00));
#else
return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
#endif
}
static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
// See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
aes_word_t a0 = batch->w[0];
aes_word_t a1 = batch->w[1];
aes_word_t a2 = batch->w[2];
aes_word_t a3 = batch->w[3];
aes_word_t a4 = batch->w[4];
aes_word_t a5 = batch->w[5];
aes_word_t a6 = batch->w[6];
aes_word_t a7 = batch->w[7];
aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
aes_word_t a7_r7 = aes_nohw_xor(a7, r7);
batch->w[0] =
aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
batch->w[1] =
aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
batch->w[2] =
aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
batch->w[3] =
aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
batch->w[4] =
aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
batch->w[5] =
aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
batch->w[6] =
aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
batch->w[7] =
aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
}
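// aes_nohw_encrypt_batch below follows the standard AES round structure
// (FIPS 197): an initial AddRoundKey, then |num_rounds| - 1 full rounds of
// SubBytes, ShiftRows, MixColumns and AddRoundKey, and a final round that
// omits MixColumns.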
static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
size_t num_rounds, AES_NOHW_BATCH *batch) {
aes_nohw_add_round_key(batch, &key->keys[0]);
for (size_t i = 1; i < num_rounds; i++) {
aes_nohw_sub_bytes(batch);
aes_nohw_shift_rows(batch);
aes_nohw_mix_columns(batch);
aes_nohw_add_round_key(batch, &key->keys[i]);
}
aes_nohw_sub_bytes(batch);
aes_nohw_shift_rows(batch);
aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
}
// Key schedule.
static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
const AES_KEY *key) {
for (unsigned i = 0; i <= key->rounds; i++) {
// Copy the round key into each block in the batch.
for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
GFp_memcpy(tmp, key->rd_key + 4 * i, 16);
aes_nohw_batch_set(&out->keys[i], tmp, j);
}
aes_nohw_transpose(&out->keys[i]);
}
}
static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
0x20, 0x40, 0x80, 0x1b, 0x36};
// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
// |rcon|, stored in an |aes_word_t|.
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
#if defined(OPENSSL_SSE2)
return _mm_set_epi32(0, 0, 0, rcon);
#else
return ((aes_word_t)rcon);
#endif
}
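// For example, with 64-bit words (|AES_NOHW_BATCH_SIZE| == 4), the round
// constant 0x1b splits into aes_nohw_rcon_slice(0x1b, 0) == 0xb and
// aes_nohw_rcon_slice(0x1b, 1) == 0x1, one slice for each of the block's two
// compact words.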
static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
AES_NOHW_BATCH batch;
GFp_memset(&batch, 0, sizeof(batch));
aes_nohw_batch_set(&batch, in, 0);
aes_nohw_transpose(&batch);
aes_nohw_sub_bytes(&batch);
aes_nohw_transpose(&batch);
aes_nohw_batch_get(&batch, out, 0);
}
static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
key->rounds = 10;
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block, in);
GFp_memcpy(key->rd_key, block, 16);
for (size_t i = 1; i <= 10; i++) {
aes_word_t sub[AES_NOHW_BLOCK_WORDS];
aes_nohw_sub_block(sub, block);
uint8_t rcon = aes_nohw_rcon[i - 1];
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate |rcon| and the transformed word into the first word.
block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
block[j] = aes_nohw_xor(
block[j],
aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
// Propagate to the remaining words. Note this is reordered from the usual
// formulation to avoid needing masks.
aes_word_t v = block[j];
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
}
GFp_memcpy(key->rd_key + 4 * i, block, 16);
}
}
static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
key->rounds = 14;
// Each key schedule iteration produces two round keys.
aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block1, in);
GFp_memcpy(key->rd_key, block1, 16);
aes_nohw_compact_block(block2, in + 16);
GFp_memcpy(key->rd_key + 4, block2, 16);
for (size_t i = 2; i <= 14; i += 2) {
aes_word_t sub[AES_NOHW_BLOCK_WORDS];
aes_nohw_sub_block(sub, block2);
uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate |rcon| and the transformed word into the first word.
block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
block1[j] = aes_nohw_xor(
block1[j],
aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
// Propagate to the remaining words.
aes_word_t v = block1[j];
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
}
GFp_memcpy(key->rd_key + 4 * i, block1, 16);
if (i == 14) {
break;
}
aes_nohw_sub_block(sub, block1);
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate the transformed word into the first word.
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
// Propagate to the remaining words.
aes_word_t v = block2[j];
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
}
GFp_memcpy(key->rd_key + 4 * (i + 1), block2, 16);
}
}
// External API.
int GFp_aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
AES_KEY *aeskey) {
switch (bits) {
case 128:
aes_nohw_setup_key_128(aeskey, key);
return 0;
case 256:
aes_nohw_setup_key_256(aeskey, key);
return 0;
}
return 1;
}
void GFp_aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
AES_NOHW_SCHEDULE sched;
aes_nohw_expand_round_keys(&sched, key);
AES_NOHW_BATCH batch;
aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}
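// A minimal usage sketch for the two entry points above, assuming a caller
// that supplies its own 128-bit key and 16-byte buffers:
//
//   AES_KEY key;
//   uint8_t raw_key[16] = {0};
//   uint8_t plaintext[16] = {0};
//   uint8_t ciphertext[16];
//   if (GFp_aes_nohw_set_encrypt_key(raw_key, 128, &key) == 0) {
//     GFp_aes_nohw_encrypt(plaintext, ciphertext, &key);
//   }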
static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
const uint8_t b[16]) {
for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
aes_word_t x, y;
GFp_memcpy(&x, a + i, sizeof(aes_word_t));
GFp_memcpy(&y, b + i, sizeof(aes_word_t));
x = aes_nohw_xor(x, y);
GFp_memcpy(out + i, &x, sizeof(aes_word_t));
}
}
void GFp_aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
size_t blocks, const AES_KEY *key,
const uint8_t ivec[16]) {
if (blocks == 0) {
return;
}
AES_NOHW_SCHEDULE sched;
aes_nohw_expand_round_keys(&sched, key);
// Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
alignas(AES_NOHW_WORD_SIZE) union {
uint32_t u32[AES_NOHW_BATCH_SIZE * 4];
uint8_t u8[AES_NOHW_BATCH_SIZE * 16];
} ivs, enc_ivs;
for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
GFp_memcpy(ivs.u8 + 16 * i, ivec, 16);
}
uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]);
for (;;) {
// Update counters.
for (uint32_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
ivs.u32[4 * i + 3] = CRYPTO_bswap4(ctr + i);
}
size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
AES_NOHW_BATCH batch;
aes_nohw_to_batch(&batch, ivs.u8, todo);
aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
aes_nohw_from_batch(enc_ivs.u8, todo, &batch);
for (size_t i = 0; i < todo; i++) {
aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs.u8 + 16 * i);
}
blocks -= todo;
if (blocks == 0) {
break;
}
in += 16 * AES_NOHW_BATCH_SIZE;
out += 16 * AES_NOHW_BATCH_SIZE;
ctr += AES_NOHW_BATCH_SIZE;
}
}
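// Note on the counter handling above: only the last 32 bits of |ivec| are
// treated as a big-endian counter (hence the CRYPTO_bswap4 calls), matching
// the ctr32 naming. The first 96 bits are copied through unchanged, and the
// caller's |ivec| is never modified.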


@@ -0,0 +1,971 @@
#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#               16-byte  64-byte  256-byte  1-KB    8-KB
#               53-67%   67-84%   91-94%    95-98%  97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
# November 2015
#
# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#               CBC en-/decrypt  CTR   XTS   ECB   OCB
# Westmere      3.77/1.37        1.37  1.52  1.27
# * Bridge      5.07/0.98        0.99  1.09  0.91  1.10
# Haswell       4.44/0.80        0.97  1.03  0.72  0.76
# Skylake       2.68/0.65        0.65  0.66  0.64  0.66
# Silvermont    5.77/3.56        3.67  4.03  3.46  4.03
# Goldmont      3.84/1.39        1.39  1.63  1.31  1.70
# Bulldozer     5.80/0.98        1.05  1.24  0.93  1.23
$PREFIX="GFp_aes_hw"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
$AESNI_PREFIX="GFp_aes_hw";
$inline=1; # inline _aesni_[en|de]crypt
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0]);
&external_label("GFp_ia32cap_P");
&static_label("key_const");
if ($PREFIX eq $AESNI_PREFIX) { $movekey=\&movups; }
else { $movekey=\&movups; }
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp"; # backup copy for $key
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5"; $in1="xmm5";
$inout4="xmm6"; $in0="xmm6";
$inout5="xmm7"; $ivec="xmm7";
# AESNI extension
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
{ &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
}
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
{ &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
}
sub aesimc { aescommon(0xdb,@_); }
sub aesenc { aescommon(0xdc,@_); }
sub aesenclast { aescommon(0xdd,@_); }
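# The subs above emit the AES-NI instructions as raw opcode bytes so the
# output assembles even with assemblers that do not know these mnemonics:
# 0x66,0x0f,0x3a,0xdf is AESKEYGENASSIST and 0x66,0x0f,0x38 followed by
# 0xdb/0xdc/0xdd is AESIMC/AESENC/AESENCLAST. The trailing 0xc0|($1<<3)|$2
# byte is the ModR/M byte selecting register-to-register operands xmm$1,xmm$2.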
# Inline version of internal aesni_[en|de]crypt1
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
$sn++;
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($ivec,$rndkey0) if (defined($ivec));
&lea ($key,&DWP(32,$key));
&xorps ($inout,$ivec) if (defined($ivec));
&xorps ($inout,$rndkey0) if (!defined($ivec));
&set_label("${p}1_loop_$sn");
eval"&aes${p} ($inout,$rndkey1)";
&dec ($rounds);
&$movekey ($rndkey1,&QWP(0,$key));
&lea ($key,&DWP(16,$key));
&jnz (&label("${p}1_loop_$sn"));
eval"&aes${p}last ($inout,$rndkey1)";
}}
sub aesni_generate1 # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
&function_begin_B("_aesni_${p}rypt1");
&movups ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(0x10,$key));
&xorps ($inout,$rndkey0);
&$movekey ($rndkey0,&QWP(0x20,$key));
&lea ($key,&DWP(0x30,$key));
&cmp ($rounds,11);
&jb (&label("${p}128"));
&lea ($key,&DWP(0x40,$key));
# 192-bit key support was removed.
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(-0x40,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(-0x30,$key));
# 192-bit key support was removed.
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(-0x20,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(-0x10,$key));
&set_label("${p}128");
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x10,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x20,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x30,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x40,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x50,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x60,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x70,$key));
eval"&aes${p} ($inout,$rndkey1)";
eval"&aes${p}last ($inout,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt1");
}
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
&mov ("eax",&wparam(0));
&mov ($key,&wparam(2));
&movups ($inout0,&QWP(0,"eax"));
&mov ($rounds,&DWP(240,$key));
&mov ("eax",&wparam(1));
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&pxor ($rndkey0,$rndkey0); # clear register bank
&pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
&pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_encrypt");
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...
sub aesni_generate2
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt2");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&add ($rounds,16);
&set_label("${p}2_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}2_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt2");
}
sub aesni_generate3
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt3");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&add ($rounds,16);
&set_label("${p}3_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}3_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt3");
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt4");
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&shl ($rounds,4);
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&data_byte (0x0f,0x1f,0x40,0x00);
&add ($rounds,16);
&set_label("${p}4_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}4_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt4");
}
sub aesni_generate6
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt6");
&static_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0); # pxor does better here
&pxor ($inout2,$rndkey0);
eval"&aes${p} ($inout0,$rndkey1)";
&pxor ($inout3,$rndkey0);
&pxor ($inout4,$rndkey0);
eval"&aes${p} ($inout1,$rndkey1)";
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout5,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key,$rounds));
&add ($rounds,16);
&jmp (&label("_aesni_${p}rypt6_inner"));
&set_label("${p}6_loop",16);
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&set_label("_aesni_${p}rypt6_inner");
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
&set_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
eval"&aes${p} ($inout4,$rndkey0)";
eval"&aes${p} ($inout5,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}6_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
eval"&aes${p}last ($inout4,$rndkey0)";
eval"&aes${p}last ($inout5,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt6");
}
&aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX);
if ($PREFIX eq $AESNI_PREFIX) {
######################################################################
# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
# 0 pshufb mask
# 16 vector addend: 0,6,6,6
# 32 counter-less ivec
# 48 1st triplet of counter vector
# 64 2nd triplet of counter vector
# 80 saved %esp
&function_begin("${PREFIX}_ctr32_encrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&mov ($key_,"esp");
&sub ("esp",88);
&and ("esp",-16); # align stack
&mov (&DWP(80,"esp"),$key_);
&cmp ($len,1);
&je (&label("ctr32_one_shortcut"));
&movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
&mov (&DWP(4,"esp"),0x08090a0b);
&mov (&DWP(8,"esp"),0x04050607);
&mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack
&mov ($rounds,6);
&xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds);
&mov (&DWP(20,"esp"),$rounds);
&mov (&DWP(24,"esp"),$rounds);
&mov (&DWP(28,"esp"),$key_);
&pextrd ($rounds_,$inout5,3); # pull 32-bit counter
&pinsrd ($inout5,$key_,3); # wipe 32-bit counter
&mov ($rounds,&DWP(240,$key)); # key->rounds
# compose 2 vectors of 3x32-bit counters
&bswap ($rounds_);
&pxor ($rndkey0,$rndkey0);
&pxor ($rndkey1,$rndkey1);
&movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
&pinsrd ($rndkey0,$rounds_,0);
&lea ($key_,&DWP(3,$rounds_));
&pinsrd ($rndkey1,$key_,0);
&inc ($rounds_);
&pinsrd ($rndkey0,$rounds_,1);
&inc ($key_);
&pinsrd ($rndkey1,$key_,1);
&inc ($rounds_);
&pinsrd ($rndkey0,$rounds_,2);
&inc ($key_);
&pinsrd ($rndkey1,$key_,2);
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
&movdqu ($inout4,&QWP(0,$key)); # key[0]
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
&pshufb ($rndkey1,$inout0); # byte swap
&pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
&pshufd ($inout1,$rndkey0,2<<6);
&cmp ($len,6);
&jb (&label("ctr32_tail"));
&pxor ($inout5,$inout4); # counter-less ivec^key[0]
&shl ($rounds,4);
&mov ($rounds_,16);
&movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
&mov ($key_,$key); # backup $key
&sub ($rounds_,$rounds); # backup twisted $rounds
&lea ($key,&DWP(32,$key,$rounds));
&sub ($len,6);
&jmp (&label("ctr32_loop6"));
&set_label("ctr32_loop6",16);
# inlining _aesni_encrypt6's prologue gives ~6% improvement...
&pshufd ($inout2,$rndkey0,1<<6);
&movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
&pshufd ($inout3,$rndkey1,3<<6);
&pxor ($inout0,$rndkey0); # merge counter-less ivec
&pshufd ($inout4,$rndkey1,2<<6);
&pxor ($inout1,$rndkey0);
&pshufd ($inout5,$rndkey1,1<<6);
&$movekey ($rndkey1,&QWP(16,$key_));
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&aesenc ($inout0,$rndkey1);
&pxor ($inout4,$rndkey0);
&pxor ($inout5,$rndkey0);
&aesenc ($inout1,$rndkey1);
&$movekey ($rndkey0,&QWP(32,$key_));
&mov ($rounds,$rounds_);
&aesenc ($inout2,$rndkey1);
&aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
&aesenc ($inout5,$rndkey1);
&call (&label("_aesni_encrypt6_enter"));
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movdqa ($rndkey0,&QWP(16,"esp")); # load increment
&xorps ($inout2,$rndkey1);
&movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&paddd ($rndkey1,$rndkey0); # 2nd triplet increment
&paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
&movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
&movups ($inout1,&QWP(0x30,$inp));
&movups ($inout2,&QWP(0x40,$inp));
&xorps ($inout3,$inout1);
&movups ($inout1,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
&xorps ($inout4,$inout2);
&movups (&QWP(0x30,$out),$inout3);
&xorps ($inout5,$inout1);
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
&pshufb ($rndkey1,$inout0); # byte swap
&movups (&QWP(0x40,$out),$inout4);
&pshufd ($inout0,$rndkey0,3<<6);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&pshufd ($inout1,$rndkey0,2<<6);
&sub ($len,6);
&jnc (&label("ctr32_loop6"));
&add ($len,6);
&jz (&label("ctr32_ret"));
&movdqu ($inout5,&QWP(0,$key_));
&mov ($key,$key_);
&pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
&mov ($rounds,&DWP(240,$key_)); # restore $rounds
&set_label("ctr32_tail");
&por ($inout0,$inout5);
&cmp ($len,2);
&jb (&label("ctr32_one"));
&pshufd ($inout2,$rndkey0,1<<6);
&por ($inout1,$inout5);
&je (&label("ctr32_two"));
&pshufd ($inout3,$rndkey1,3<<6);
&por ($inout2,$inout5);
&cmp ($len,4);
&jb (&label("ctr32_three"));
&pshufd ($inout4,$rndkey1,2<<6);
&por ($inout3,$inout5);
&je (&label("ctr32_four"));
&por ($inout4,$inout5);
&call ("_aesni_encrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout2,$rndkey1);
&movups ($rndkey1,&QWP(0x40,$inp));
&xorps ($inout3,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout4,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&movups ($in0,&QWP(0,$inp));
&xorps ($in0,$inout0);
&movups (&QWP(0,$out),$in0);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_two",16);
&call ("_aesni_encrypt2");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_three",16);
&call ("_aesni_encrypt3");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&movups ($inout5,&QWP(0x20,$inp));
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$inout5);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_four",16);
&call ("_aesni_encrypt4");
&movups ($inout4,&QWP(0,$inp));
&movups ($inout5,&QWP(0x10,$inp));
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout0,$inout4);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout1,$inout5);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&set_label("ctr32_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
&pxor ("xmm5","xmm5");
&movdqa (&QWP(48,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(64,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&mov ("esp",&DWP(80,"esp"));
&function_end("${PREFIX}_ctr32_encrypt_blocks");
}
######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
# "eax" const unsigned char *userKey
# $rounds int bits
# $key AES_KEY *key
# output:
# "eax" return code
# $round rounds
&function_begin_B("_aesni_set_encrypt_key");
&push ("ebp");
&push ("ebx");
&test ("eax","eax");
&jz (&label("bad_pointer"));
&test ($key,$key);
&jz (&label("bad_pointer"));
&call (&label("pic"));
&set_label("pic");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
&picmeup("ebp","GFp_ia32cap_P","ebx",&label("key_const"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
&mov ("ebp",&DWP(4,"ebp"));
&lea ($key,&DWP(16,$key));
&and ("ebp",1<<28|1<<11); # AVX and XOP bits
&cmp ($rounds,256);
&je (&label("14rounds"));
# 192-bit key support was removed.
&cmp ($rounds,128);
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
&cmp ("ebp",1<<28);
&je (&label("10rounds_alt"));
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
&call (&label("key_128_cold"));
&aeskeygenassist("xmm1","xmm0",0x2); # round 2
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x04); # round 3
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x08); # round 4
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x10); # round 5
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x20); # round 6
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x40); # round 7
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x80); # round 8
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x1b); # round 9
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x36); # round 10
&call (&label("key_128"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(80,$key),$rounds);
&jmp (&label("good_key"));
&set_label("key_128",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&set_label("key_128_cold");
&shufps ("xmm4","xmm0",0b00010000);
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
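#
# Descriptive note: key_128/key_128_cold above compute one step of the
# FIPS-197 AES-128 key expansion. With the previous round key as words
# w0..w3 and t = SubWord(RotWord(w3)) ^ Rcon (delivered in the high dword of
# xmm1 by aeskeygenassist and splatted by the shufps with 0b11111111), the
# new round key is
#
#	w0' = w0 ^ t
#	w1' = w1 ^ w0'
#	w2' = w2 ^ w1'
#	w3' = w3 ^ w2'
#
# The two shufps/xorps pairs compute the prefix-XOR (w0, w0^w1, w0^w1^w2,
# w0^w1^w2^w3), relying on the low dword of xmm4 staying zero, and the final
# xorps folds in the splatted t.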
&set_label("10rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&mov ($rounds,8);
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&movdqa ("xmm2","xmm0");
&movdqu (&QWP(-16,$key),"xmm0");
&set_label("loop_key128");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&lea ($key,&DWP(16,$key));
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(-16,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&dec ($rounds);
&jnz (&label("loop_key128"));
&movdqa ("xmm4",&QWP(0x30,"ebx"));
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(16,$key),"xmm0");
&mov ($rounds,9);
&mov (&DWP(96,$key),$rounds);
&jmp (&label("good_key"));
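#
# Note on the 10rounds_alt path above (selected by the GFp_ia32cap_P test
# earlier): it derives the same schedule without aeskeygenassist. The pshufb
# with the first key_const mask rotates and splats word 3 of the previous
# round key; aesenclast against the rcon vector in xmm4 then yields
# SubWord(RotWord(w3)) ^ Rcon in every dword (with all columns equal,
# ShiftRows is a no-op), and the pslldq/pxor chain performs the same
# prefix-XOR as key_128_cold. pslld doubles the rcon for the next iteration.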
# 192-bit key support was removed.
&set_label("14rounds",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
&lea ($key,&DWP(16,$key));
&cmp ("ebp",1<<28);
&je (&label("14rounds_alt"));
&mov ($rounds,13);
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
&call (&label("key_256a_cold"));
&aeskeygenassist("xmm1","xmm0",0x01); # round 3
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x02); # round 4
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x02); # round 5
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x04); # round 6
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x04); # round 7
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x08); # round 8
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x08); # round 9
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x10); # round 10
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x10); # round 11
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x20); # round 12
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x20); # round 13
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x40); # round 14
&call (&label("key_256a"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(16,$key),$rounds);
&xor ("eax","eax");
&jmp (&label("good_key"));
&set_label("key_256a",16);
&$movekey (&QWP(0,$key),"xmm2");
&lea ($key,&DWP(16,$key));
&set_label("key_256a_cold");
&shufps ("xmm4","xmm0",0b00010000);
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
&set_label("key_256b",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&shufps ("xmm4","xmm2",0b00010000);
&xorps ("xmm2","xmm4");
&shufps ("xmm4","xmm2",0b10001100);
&xorps ("xmm2","xmm4");
&shufps ("xmm1","xmm1",0b10101010); # critical path
&xorps ("xmm2","xmm1");
&ret();
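#
# Descriptive note: a 256-bit key needs 15 round keys (14 rounds). key_256a
# expands the even-numbered round keys using SubWord(RotWord())^Rcon exactly
# as key_128_cold does, while key_256b expands the odd-numbered ones and, per
# FIPS-197, applies SubWord only - the 0b10101010 splat selects the
# SubWord-only dword of the aeskeygenassist result instead of the
# rotated-and-rcon'd one.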
&set_label("14rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,7);
&movdqu (&QWP(-32,$key),"xmm0");
&movdqa ("xmm1","xmm2");
&movdqu (&QWP(-16,$key),"xmm2");
&set_label("loop_key256");
&pshufb ("xmm2","xmm5");
&aesenclast ("xmm2","xmm4");
&movdqa ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm0","xmm3");
&pslld ("xmm4",1);
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&dec ($rounds);
&jz (&label("done_key256"));
&pshufd ("xmm2","xmm0",0xff);
&pxor ("xmm3","xmm3");
&aesenclast ("xmm2","xmm3");
&movdqa ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm1","xmm3");
&pxor ("xmm2","xmm1");
&movdqu (&QWP(16,$key),"xmm2");
&lea ($key,&DWP(32,$key));
&movdqa ("xmm1","xmm2");
&jmp (&label("loop_key256"));
&set_label("done_key256");
&mov ($rounds,13);
&mov (&DWP(16,$key),$rounds);
&set_label("good_key");
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&xor ("eax","eax");
&pop ("ebx");
&pop ("ebp");
&ret ();
&set_label("bad_pointer",4);
&mov ("eax",-1);
&pop ("ebx");
&pop ("ebp");
&ret ();
&set_label("bad_keybits",4);
&pxor ("xmm0","xmm0");
&mov ("eax",-2);
&pop ("ebx");
&pop ("ebp");
&ret ();
&function_end_B("_aesni_set_encrypt_key");
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# AES_KEY *key)
&function_begin_B("${PREFIX}_set_encrypt_key");
&mov ("eax",&wparam(0));
&mov ($rounds,&wparam(1));
&mov ($key,&wparam(2));
&call ("_aesni_set_encrypt_key");
&ret ();
&function_end_B("${PREFIX}_set_encrypt_key");
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
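# key_const layout (a descriptive note): the first 16 bytes are a pshufb mask
# that rotates word 3 of a round key by one byte and splats it across all
# four dwords; the second mask apparently does the same for word 1 and looks
# unused now that 192-bit support is removed; the last two rows are the
# initial rcon vectors 0x01 and 0x1b used by the *_alt key-schedule paths.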
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large

View File

@@ -0,0 +1,630 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases, and it likewise supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
# CBC enc CBC dec CTR
# Apple A7 2.39 1.20 1.20
# Cortex-A53 1.32 1.29 1.46
# Cortex-A57(*) 1.95 0.85 0.93
# Denver 1.96 0.86 0.80
# Mongoose 1.33 1.20 1.20
#
# (*) original 3.64/1.34/1.32 results were for r0p0 revision
# and are still the same even for the updated module.
$flavour = shift;
$output = shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
$prefix="aes_hw";
$code=<<___;
#include <GFp/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with armv8 :-)
.fpu neon
.code 32
#undef __thumb2__
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is to
# maintain both 32- and 64-bit code within a single module and
# transliterate common code to either flavour with regex voodoo.
#
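#
# For example (illustrative; see the substitution loops at the end of this
# file), a line written in the common flavour as
#
#	vld1.32	{q8},[x0],#16
#
# should come out roughly as "ld1 {v16.4s},[x0],#16" in the 64-bit flavour
# (q8..q15 are remapped to v16..v23 there) and as "vld1.32 {q8},[r0]!" in
# the 32-bit one.
#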
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
# On AArch64, put the data in .rodata and use adrp + add for compatibility with
# execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.text
.globl GFp_${prefix}_set_encrypt_key
.type GFp_${prefix}_set_encrypt_key,%function
.align 5
GFp_${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___;
mov $ptr,#-1
cmp $inp,#0
b.eq .Lenc_key_abort
cmp $out,#0
b.eq .Lenc_key_abort
mov $ptr,#-2
cmp $bits,#128
b.lt .Lenc_key_abort
cmp $bits,#256
b.gt .Lenc_key_abort
tst $bits,#0x3f
b.ne .Lenc_key_abort
___
$code.=<<___ if ($flavour =~ /64/);
adrp $ptr,:pg_hi21:.Lrcon
add $ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___ if ($flavour !~ /64/);
adr $ptr,.Lrcon
___
$code.=<<___;
cmp $bits,#192
veor $zero,$zero,$zero
vld1.8 {$in0},[$inp],#16
mov $bits,#8 // reuse $bits
vld1.32 {$rcon,$mask},[$ptr],#32
b.lt .Loop128
// 192-bit key support was removed.
b .L256
.align 4
.Loop128:
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
b.ne .Loop128
vld1.32 {$rcon},[$ptr]
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
veor $in0,$in0,$key
vst1.32 {$in0},[$out]
add $out,$out,#0x50
mov $rounds,#10
b .Ldone
// 192-bit key support was removed.
.align 4
.L256:
vld1.8 {$in1},[$inp]
mov $bits,#7
mov $rounds,#14
vst1.32 {$in0},[$out],#16
.Loop256:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vst1.32 {$in0},[$out],#16
b.eq .Ldone
vdup.32 $key,${in0}[3] // just splat
vext.8 $tmp,$zero,$in1,#12
aese $key,$zero
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
veor $in1,$in1,$key
b .Loop256
.Ldone:
str $rounds,[$out]
mov $ptr,#0
.Lenc_key_abort:
mov x0,$ptr // return value
`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
ret
.size GFp_${prefix}_set_encrypt_key,.-GFp_${prefix}_set_encrypt_key
___
}}}
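# A descriptive note on the key-expansion loops above: vtbl with the
# rotate-n-splat mask broadcasts RotWord of the last key word, aese against
# an all-zero key reduces to SubBytes (ShiftRows is a no-op on a splatted
# state), the vext/veor chain performs the standard FIPS-197 prefix-XOR, and
# the rcon vector is doubled with vshl on each iteration. The final rcon
# value 0x1b is loaded separately from .Lrcon.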
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
$code.=<<___;
.globl GFp_${prefix}_${dir}crypt
.type GFp_${prefix}_${dir}crypt,%function
.align 5
GFp_${prefix}_${dir}crypt:
AARCH64_VALID_CALL_TARGET
ldr $rounds,[$key,#240]
vld1.32 {$rndkey0},[$key],#16
vld1.8 {$inout},[$inp]
sub $rounds,$rounds,#2
vld1.32 {$rndkey1},[$key],#16
.Loop_${dir}c:
aes$e $inout,$rndkey0
aes$mc $inout,$inout
vld1.32 {$rndkey0},[$key],#16
subs $rounds,$rounds,#2
aes$e $inout,$rndkey1
aes$mc $inout,$inout
vld1.32 {$rndkey1},[$key],#16
b.gt .Loop_${dir}c
aes$e $inout,$rndkey0
aes$mc $inout,$inout
vld1.32 {$rndkey0},[$key]
aes$e $inout,$rndkey1
veor $inout,$inout,$rndkey0
vst1.8 {$inout},[$out]
ret
.size GFp_${prefix}_${dir}crypt,.-GFp_${prefix}_${dir}crypt
___
}
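# The two &gen_block calls below emit GFp_aes_hw_encrypt and
# GFp_aes_hw_decrypt, single-block primitives. Round keys are consumed two at
# a time in .Loop_*c; the final round applies aese/aesd without a matching
# aesmc/aesimc and then veor's in the last round key, because the last AES
# round has no MixColumns.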
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);
### q8-q15 preloaded key schedule
$code.=<<___;
.globl GFp_${prefix}_ctr32_encrypt_blocks
.type GFp_${prefix}_ctr32_encrypt_blocks,%function
.align 5
GFp_${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
___
$code.=<<___;
ldr $rounds,[$key,#240]
ldr $ctr, [$ivp, #12]
vld1.32 {$dat0},[$ivp]
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#4
mov $step,#16
cmp $len,#2
add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
sub $rounds,$rounds,#2
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
cclr $step,lo
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov.32 lines
// could write to $dat1 and $dat2 directly, but that trips these bugs.
// We write to $ivec and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
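//
// The last 32 bits of the IV hold the big-endian block counter. It is
// byte-reversed into host order below (the reversal is skipped on big-endian
// builds) so it can be advanced with ordinary integer adds, then reversed
// back before being inserted into the last vector lane for each block.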
#ifndef __ARMEB__
rev $ctr, $ctr
#endif
add $tctr1, $ctr, #1
vorr $ivec,$dat0,$dat0
rev $tctr1, $tctr1
vmov.32 ${ivec}[3],$tctr1
add $ctr, $ctr, #2
vorr $dat1,$ivec,$ivec
b.ls .Lctr32_tail
rev $tctr2, $ctr
vmov.32 ${ivec}[3],$tctr2
sub $len,$len,#3 // bias
vorr $dat2,$ivec,$ivec
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Loop3x_ctr32
aese $dat0,q8
aesmc $tmp0,$dat0
aese $dat1,q8
aesmc $tmp1,$dat1
vld1.8 {$in0},[$inp],#16
add $tctr0,$ctr,#1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
rev $tctr0,$tctr0
aese $tmp0,q9
aesmc $tmp0,$tmp0
aese $tmp1,q9
aesmc $tmp1,$tmp1
vld1.8 {$in2},[$inp],#16
mov $key_,$key
aese $dat2,q9
aesmc $tmp2,$dat2
aese $tmp0,q12
aesmc $tmp0,$tmp0
aese $tmp1,q12
aesmc $tmp1,$tmp1
veor $in0,$in0,$rndlast
add $tctr1,$ctr,#2
aese $tmp2,q12
aesmc $tmp2,$tmp2
veor $in1,$in1,$rndlast
add $ctr,$ctr,#3
aese $tmp0,q13
aesmc $tmp0,$tmp0
aese $tmp1,q13
aesmc $tmp1,$tmp1
// Note the logic to update $dat0, $dat1, and $dat2 is written to work
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
veor $in2,$in2,$rndlast
vmov.32 ${ivec}[3], $tctr0
aese $tmp2,q13
aesmc $tmp2,$tmp2
vorr $dat0,$ivec,$ivec
rev $tctr1,$tctr1
aese $tmp0,q14
aesmc $tmp0,$tmp0
vmov.32 ${ivec}[3], $tctr1
rev $tctr2,$ctr
aese $tmp1,q14
aesmc $tmp1,$tmp1
vorr $dat1,$ivec,$ivec
vmov.32 ${ivec}[3], $tctr2
aese $tmp2,q14
aesmc $tmp2,$tmp2
vorr $dat2,$ivec,$ivec
subs $len,$len,#3
aese $tmp0,q15
aese $tmp1,q15
aese $tmp2,q15
veor $in0,$in0,$tmp0
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
vst1.8 {$in0},[$out],#16
veor $in1,$in1,$tmp1
mov $cnt,$rounds
vst1.8 {$in1},[$out],#16
veor $in2,$in2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$in2},[$out],#16
b.hs .Loop3x_ctr32
adds $len,$len,#3
b.eq .Lctr32_done
cmp $len,#1
mov $step,#16
cclr $step,eq
.Lctr32_tail:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
vld1.32 {q9},[$key_],#16
b.gt .Lctr32_tail
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
vld1.8 {$in0},[$inp],$step
aese $dat0,q12
aesmc $dat0,$dat0
aese $dat1,q12
aesmc $dat1,$dat1
vld1.8 {$in1},[$inp]
aese $dat0,q13
aesmc $dat0,$dat0
aese $dat1,q13
aesmc $dat1,$dat1
veor $in0,$in0,$rndlast
aese $dat0,q14
aesmc $dat0,$dat0
aese $dat1,q14
aesmc $dat1,$dat1
veor $in1,$in1,$rndlast
aese $dat0,q15
aese $dat1,q15
cmp $len,#1
veor $in0,$in0,$dat0
veor $in1,$in1,$dat1
vst1.8 {$in0},[$out],#16
b.eq .Lctr32_done
vst1.8 {$in1},[$out]
.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size GFp_${prefix}_ctr32_encrypt_blocks,.-GFp_${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5),
$mnemonic,$arg;
};
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vext\.8/ext/o or
s/vrev32\.8/rev32/o or
s/vtst\.8/cmtst/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
# fix up remaining legacy suffixes
s/\.[ui]?8//o;
m/\],#8/o and s/\.16b/\.8b/go;
s/\.[ui]?32//o and s/\.16b/\.4s/go;
s/\.[ui]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
print $_,"\n";
}
} else { ######## 32-bit code
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<1) |(($2&8)<<2);
# Emit the raw bytes directly, since ARMv7 instructions are always
# encoded little-endian. The correct solution is to use the .inst
# directive, but older assemblers don't implement it :-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
};
sub unvtbl {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large

View File

@@ -0,0 +1,603 @@
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
# aes-586.pl vpaes-x86.pl
#
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
# Nehalem 27.9/40.4/18.1 10.2/11.9
# Atom 70.7/92.1/60.1 61.1/75.4(***)
# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
#
# (*) "Hyper-threading" in this context refers to cache shared
# among multiple cores rather than specifically to Intel HTT. As the vast
# majority of contemporary cores share cache, the slower code path
# is commonplace. In other words, "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) The less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's a respectable +28%/64% improvement on Core 2
# and +15% on Atom (as implied, over "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
&set_label("_vpaes_consts",64);
$k_inv=-0x30; # inv, inva
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
$k_s0F=-0x10; # s0F
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
$k_ipt=0x00; # input transform (lo, hi)
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
$k_sb1=0x20; # sb1u, sb1t
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40; # sb2u, sb2t
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60; # sbou, sbot
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
$k_mc_forward=0x80; # mc_forward
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
$k_mc_backward=0xc0; # mc_backward
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
$k_sr=0x100; # sr
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
$k_rcon=0x140; # rcon
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
$k_s63=0x150; # s63: all equal to 0x63 transformed
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
$k_opt=0x160; # output transform
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&add ($const,&DWP(0,"esp"));
&movdqa ("xmm7",&QWP($k_inv,$const));
&movdqa ("xmm6",&QWP($k_s0F,$const));
&ret ();
&function_end_B("_vpaes_preheat");
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm6-%xmm7 as in _vpaes_preheat
## (%edx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
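##
## A rough per-byte sketch of the technique (see the Hamburg paper referenced
## above for the actual GF(2^4) derivation): each input byte x is split into
## nibbles, and every S-box-like step becomes two 16-entry pshufb lookups,
##
##	lo = x & 0x0f; hi = x >> 4;
##	y  = tbl_lo[lo] ^ tbl_hi[hi];
##
## so no data-dependent memory access is ever made.
##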
&function_begin_B("_vpaes_encrypt_core");
&mov ($magic,16);
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
&pand ("xmm0","xmm6");
&movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
&pxor ("xmm2","xmm5");
&psrld ("xmm1",4);
&add ($key,16);
&pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
&pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
&pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
&pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
&pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
&pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
&pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
&pshufb ("xmm3","xmm4"); # 3 = D
&add ($magic,16); # next mc
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
&sub ($round,1); # nr--
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&pshufb ("xmm5","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
# middle of last round
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm1");
&ret ();
&function_end_B("_vpaes_encrypt_core");
########################################################
## ##
## AES key schedule ##
## ##
########################################################
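##
## Flow (descriptive): after the input transform, 128-bit keys take
## .schedule_128 - ten _vpaes_schedule_round calls, each but the last
## followed by _vpaes_schedule_mangle to write out a round key - while
## 256-bit keys take .schedule_256, seven iterations that alternate a "high"
## round (with rcon and rotation) and a "low" round (with neither) on the two
## key halves. Both finish in .schedule_mangle_last, which applies the output
## transform and the shiftrows-style permute before storing the final key.
##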
&function_begin_B("_vpaes_schedule_core");
&add ($const,&DWP(0,"esp"));
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
# input transform
&movdqa ("xmm3","xmm0");
&lea ($base,&DWP($k_ipt,$const));
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
&call ("_vpaes_schedule_transform");
&movdqa ("xmm7","xmm0");
&test ($out,$out);
&jnz (&label("schedule_am_decrypting"));
# encrypting, output zeroth round key after transform
&movdqu (&QWP(0,$key),"xmm0");
&jmp (&label("schedule_go"));
&set_label("schedule_am_decrypting");
# decrypting, output zeroth round key after shiftrows
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&movdqu (&QWP(0,$key),"xmm3");
&xor ($magic,0x30);
&set_label("schedule_go");
&cmp ($round,192);
&ja (&label("schedule_256"));
# 192-bit key support was removed.
# 128: fall through
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
&set_label("schedule_128");
&mov ($round,10);
&set_label("loop_schedule_128");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # write output
&jmp (&label("loop_schedule_128"));
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_mangle"); # output low result
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
&movdqa (&QWP(20,"esp"),"xmm7");
&movdqa ("xmm7","xmm6");
&call ("_vpaes_schedule_low_round");
&movdqa ("xmm7",&QWP(20,"esp"));
&jmp (&label("loop_schedule_256"));
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
# schedule last round key from xmm0
&lea ($base,&DWP($k_deskew,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_last_dec"));
# encrypting
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm0","xmm1"); # output permute
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
&add ($key,32);
&set_label("schedule_mangle_last_dec");
&add ($key,-16);
&pxor ("xmm0",&QWP($k_s63,$const));
&call ("_vpaes_schedule_transform"); # output transform
&movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&ret ();
&function_end_B("_vpaes_schedule_core");
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
# extract rcon from xmm8
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
&pxor ("xmm1","xmm1");
&palignr("xmm1","xmm2",15);
&palignr("xmm2","xmm2",15);
&pxor ("xmm7","xmm1");
# rotate
&pshufd ("xmm0","xmm0",0xFF);
&palignr("xmm0","xmm0",1);
# fall through...
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
# smear xmm7
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",4);
&pxor ("xmm7","xmm1");
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",8);
&pxor ("xmm7","xmm1");
&pxor ("xmm7",&QWP($k_s63,$const));
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm5"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm5"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = sbox output
# add in smeared stuff
&pxor ("xmm0","xmm7");
&movdqa ("xmm7","xmm0");
&ret ();
&function_end_B("_vpaes_schedule_round");
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%ebx)
##
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
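##
## Per byte (descriptive), with the lo/hi tables at (%ebx) and 16(%ebx):
##
##	out = tbl_lo[in & 0x0f] ^ tbl_hi[in >> 4]
##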
&function_begin_B("_vpaes_schedule_transform");
&movdqa ("xmm2",&QWP($k_s0F,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4);
&pand ("xmm0","xmm2");
&movdqa ("xmm2",&QWP(0,$base));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP(16,$base));
&pshufb ("xmm0","xmm1");
&pxor ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_schedule_transform");
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%edx), and increments or decrements it
## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
## Clobbers xmm1-xmm5
##
&function_begin_B("_vpaes_schedule_mangle");
&movdqa ("xmm4","xmm0"); # save xmm0 for later
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_dec"));
# encrypting
&add ($key,16);
&pxor ("xmm4",&QWP($k_s63,$const));
&pshufb ("xmm4","xmm5");
&movdqa ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&jmp (&label("schedule_mangle_both"));
&set_label("schedule_mangle_dec",16);
# inverse mix columns
&movdqa ("xmm2",&QWP($k_s0F,$const));
&lea ($inp,&DWP($k_dksd,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm4");
&psrld ("xmm1",4); # 1 = hi
&pand ("xmm4","xmm2"); # 4 = lo
&movdqa ("xmm2",&QWP(0,$inp));
&pshufb ("xmm2","xmm4");
&movdqa ("xmm3",&QWP(0x10,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x20,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x30,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x40,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x50,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x60,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x70,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&add ($key,-16);
&set_label("schedule_mangle_both");
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&add ($magic,-16);
&and ($magic,0x30);
&movdqu (&QWP(0,$key),"xmm3");
&ret ();
&function_end_B("_vpaes_schedule_mangle");
#
# Interface to OpenSSL
#
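# Presumed C-level prototypes, mirroring the comment style used for the
# AES-NI set_encrypt_key earlier in this diff (the argument order is taken
# from the wparam() usage below; the exact types are an assumption):
#
# int GFp_vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
#                               AES_KEY *key);
# void GFp_vpaes_encrypt(const unsigned char *in, unsigned char *out,
#                        const AES_KEY *key);
#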
&function_begin("GFp_${PREFIX}_set_encrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&mov ($magic,0x30);
&mov ($out,0);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("GFp_${PREFIX}_set_encrypt_key");
&function_begin("GFp_${PREFIX}_encrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_encrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("GFp_${PREFIX}_encrypt");
&asm_finish();
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large