Fix the RPM build (reverts CI changes that will need to be un-reverted or made conditional) and vendor the Rust dependencies to make builds much faster in any CI system.
zeroidc/vendor/ring/crypto/fipsmodule/aes/aes_nohw.c (vendored, new file, 961 lines)
@@ -0,0 +1,961 @@
/* Copyright (c) 2019, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <GFp/aes.h>

#include "../../internal.h"

#if defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif


// This file contains a constant-time implementation of AES, bitsliced with
// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
//
// This implementation is based on the algorithms described in the following
// references:
// - https://bearssl.org/constanttime.html#aes
// - https://eprint.iacr.org/2009/129.pdf
// - https://eprint.iacr.org/2009/191.pdf


// Word operations.
//
// An aes_word_t is the word used for this AES implementation. Throughout this
// file, bits and bytes are ordered little-endian, though "left" and "right"
// shifts match the operations themselves, which makes them reversed in a
// little-endian, left-to-right reading.
//
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
// bits each, each corresponding to a byte in an AES block in column-major
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
// contents of a logical byte will be described later.)
//
// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
// value ranges from 0 to 15 independent of |aes_word_t| and
// |AES_NOHW_BATCH_SIZE|.
//
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
// uses row-major order. Matching the AES order was easier to reason about, and
// we do not have PSHUFB available to arbitrarily permute bytes.

#if defined(OPENSSL_SSE2)
typedef __m128i aes_word_t;
// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
// MSVC, so we define a constant.
#define AES_NOHW_WORD_SIZE 16
#define AES_NOHW_BATCH_SIZE 8
#define AES_NOHW_ROW0_MASK \
  _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
#define AES_NOHW_ROW1_MASK \
  _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
#define AES_NOHW_ROW2_MASK \
  _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
#define AES_NOHW_ROW3_MASK \
  _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
#define AES_NOHW_COL01_MASK \
  _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
#define AES_NOHW_COL2_MASK \
  _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
#define AES_NOHW_COL3_MASK \
  _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return _mm_and_si128(a, b);
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return _mm_or_si128(a, b);
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return _mm_xor_si128(a, b);
}

static inline aes_word_t aes_nohw_not(aes_word_t a) {
  return _mm_xor_si128(
      a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
}

// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
// must be constants.
#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
  _mm_slli_si128((a), (i))
#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
  _mm_srli_si128((a), (i))
#else  // !OPENSSL_SSE2
#if defined(OPENSSL_64_BIT)
typedef uint64_t aes_word_t;
#define AES_NOHW_WORD_SIZE 8
#define AES_NOHW_BATCH_SIZE 4
#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
#else  // !OPENSSL_64_BIT
typedef uint32_t aes_word_t;
#define AES_NOHW_WORD_SIZE 4
#define AES_NOHW_BATCH_SIZE 2
#define AES_NOHW_ROW0_MASK 0x03030303
#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
#define AES_NOHW_ROW2_MASK 0x30303030
#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
#define AES_NOHW_COL01_MASK 0x0000ffff
#define AES_NOHW_COL2_MASK 0x00ff0000
#define AES_NOHW_COL3_MASK 0xff000000
#endif  // OPENSSL_64_BIT

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return a & b;
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return a | b;
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return a ^ b;
}

static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }

static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
  return a << (i * AES_NOHW_BATCH_SIZE);
}

static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
  return a >> (i * AES_NOHW_BATCH_SIZE);
}
#endif  // OPENSSL_SSE2

OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
                      "batch size does not match word size");
OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
                      "AES_NOHW_WORD_SIZE is incorrect");


// Block representations.
//
// This implementation uses three representations for AES blocks. First, the
// public API represents blocks as uint8_t[16] in the usual way. Second, most
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
// bars divide logical bytes):
//
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//   ...
//
// Finally, an individual block may be stored as an intermediate form in an
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
// block, so that block[0]'s ith logical byte contains the least-significant
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
// "compacting" the block. Note this is a no-op with 128-bit words because then
// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
// words, one block would be stored in two words:
//
//   block[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
//   block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
//
// Observe that the distances between corresponding bits in bitsliced and
// compact bit orders match. If we line up corresponding words of each block,
// the bitsliced and compact representations may be converted by transposing
// bits in corresponding logical bytes. Continuing the 64-bit example:
//
//   block_a[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
//   block_b[0] = b0 b1 b2 b3 |  b8  b9 b10 b11 | b16 b17 b18 b19 ...
//   block_c[0] = c0 c1 c2 c3 |  c8  c9 c10 c11 | c16 c17 c18 c19 ...
//   block_d[0] = d0 d1 d2 d3 |  d8  d9 d10 d11 | d16 d17 d18 d19 ...
//
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//
// Note also that bitwise operations and (logical) byte permutations on an
// |aes_word_t| work equally for the bitsliced and compact words.
//
// We use the compact form in the |AES_KEY| representation to save work
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
// before or after |aes_nohw_transpose|.

#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))

// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
// specified, it is in bitsliced form.
typedef struct {
  aes_word_t w[8];
} AES_NOHW_BATCH;

// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH_SIZE|
// |AES_KEY|s so it should not be used as a long-term key representation.
typedef struct {
  // keys is an array of batches, one for each round key. Each batch stores
  // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
  AES_NOHW_BATCH keys[AES_MAXNR + 1];
} AES_NOHW_SCHEDULE;

// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
// compact form.
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
                                      const aes_word_t in[AES_NOHW_BLOCK_WORDS],
                                      size_t i) {
  // Note the words are interleaved. The order comes from |aes_nohw_transpose|.
  // If |i| is zero and this is the 64-bit implementation, in[0] contains bits
  // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
  // w[4] so that bits 0 and 4 are in the correct position. (In general, bits
  // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
  // will be correctly placed.)
  dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
  batch->w[i] = in[0];
#elif defined(OPENSSL_64_BIT)
  batch->w[i] = in[0];
  batch->w[i + 4] = in[1];
#else
  batch->w[i] = in[0];
  batch->w[i + 2] = in[1];
  batch->w[i + 4] = in[2];
  batch->w[i + 6] = in[3];
#endif
}

// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
// compact form.
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                      size_t i) {
  dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
  out[0] = batch->w[i];
#elif defined(OPENSSL_64_BIT)
  out[0] = batch->w[i];
  out[1] = batch->w[i + 4];
#else
  out[0] = batch->w[i];
  out[1] = batch->w[i + 2];
  out[2] = batch->w[i + 4];
  out[3] = batch->w[i + 6];
#endif
}

#if !defined(OPENSSL_SSE2)
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
                                             aes_word_t shift) {
  // See
  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
  aes_word_t b = (a ^ (a >> shift)) & mask;
  return a ^ b ^ (b << shift);
}

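// Illustrative note (editor's addition, not part of the upstream source): a
// delta swap exchanges the bit group selected by |mask| with the group |shift|
// positions above it, leaving every other bit untouched. For example, on an
// 8-bit value,
//   aes_nohw_delta_swap(0xb4 /* 0b10110100 */, 0x0c, 4)
// swaps bits 2-3 with bits 6-7 and yields 0x78 (0b01111000).
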
// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
  // Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
  // quartets of those chunks:
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15 =>
  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 |  9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
  //   0 2 4 6 | 1  3  5  7 | 8 10 12 14 | 9 11 13 15 =>
  //   0 2 4 6 | 8 10 12 14 | 1  3  5  7 | 9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  return a;
}

static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
  // Reverse the steps of |aes_nohw_compact_word|.
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  return a;
}
#else  // !OPENSSL_64_BIT
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 |  9 13 11 15
  // Note: 0x00cc = 0b0000_0000_1100_1100
  //   0x00cc << 6 = 0b0011_0011_0000_0000
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  // Now we swap groups of four bits (still numbering by pairs):
  //   0 4 2 6 | 1  5  3  7 | 8 12 10 14 | 9 13 11 15 =>
  //   0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  return a;
}

static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
  // Reverse the steps of |aes_nohw_compact_word|.
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  return a;
}

static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
                                                uint8_t a2, uint8_t a3) {
  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
         ((uint32_t)a3 << 24);
}

static inline uint8_t lo(uint32_t a) {
  return (uint8_t)a;
}

#endif  // OPENSSL_64_BIT
#endif  // !OPENSSL_SSE2

static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                          const uint8_t in[16]) {
  GFp_memcpy(out, in, 16);
#if defined(OPENSSL_SSE2)
  // No conversions needed.
#elif defined(OPENSSL_64_BIT)
  uint64_t a0 = aes_nohw_compact_word(out[0]);
  uint64_t a1 = aes_nohw_compact_word(out[1]);
  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
#else
  uint32_t a0 = aes_nohw_compact_word(out[0]);
  uint32_t a1 = aes_nohw_compact_word(out[1]);
  uint32_t a2 = aes_nohw_compact_word(out[2]);
  uint32_t a3 = aes_nohw_compact_word(out[3]);
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
  // without optimizations. This bug was introduced in
  // https://reviews.llvm.org/rL340261 and fixed in
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
  out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
  out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
  out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
  out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
#endif
}

static inline void aes_nohw_uncompact_block(
    uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
#if defined(OPENSSL_SSE2)
  GFp_memcpy(out, in, 16);  // No conversions needed.
#elif defined(OPENSSL_64_BIT)
  uint64_t a0 = in[0];
  uint64_t a1 = in[1];
  uint64_t b0 =
      aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
  uint64_t b1 =
      aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
  GFp_memcpy(out, &b0, 8);
  GFp_memcpy(out + 8, &b1, 8);
#else
  uint32_t a0 = in[0];
  uint32_t a1 = in[1];
  uint32_t a2 = in[2];
  uint32_t a3 = in[3];
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
  // without optimizations. This bug was introduced in
  // https://reviews.llvm.org/rL340261 and fixed in
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
  uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
  uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
  uint32_t b2 =
      aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
  uint32_t b3 =
      aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
  b0 = aes_nohw_uncompact_word(b0);
  b1 = aes_nohw_uncompact_word(b1);
  b2 = aes_nohw_uncompact_word(b2);
  b3 = aes_nohw_uncompact_word(b3);
  GFp_memcpy(out, &b0, 4);
  GFp_memcpy(out + 4, &b1, 4);
  GFp_memcpy(out + 8, &b2, 4);
  GFp_memcpy(out + 12, &b3, 4);
#endif
}

// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
// is repeated to the full width of |aes_word_t|.
#if defined(OPENSSL_SSE2)
// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
// constant shift values.
#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b,              \
                           /* uint32_t */ mask, /* const */ shift)        \
  do {                                                                     \
    __m128i swap =                                                         \
        _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)),  \
                      _mm_set_epi32((mask), (mask), (mask), (mask)));      \
    *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift)));             \
    *(b) = _mm_xor_si128(*(b), swap);                                      \
                                                                           \
  } while (0)
#else
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
                                      uint32_t mask, aes_word_t shift) {
#if defined(OPENSSL_64_BIT)
  aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
#else
  aes_word_t mask_w = mask;
#endif
  // This is a variation on a delta swap.
  aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
  *a ^= swap << shift;
  *b ^= swap;
}
#endif  // OPENSSL_SSE2

// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
// and transposes each square.
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
  // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);

#if AES_NOHW_BATCH_SIZE >= 4
  // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
#endif

#if AES_NOHW_BATCH_SIZE >= 8
  // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
#endif
}

// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
                              size_t num_blocks) {
  // Don't leave unused blocks uninitialized.
  GFp_memset(out, 0, sizeof(AES_NOHW_BATCH));
  debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
  for (size_t i = 0; i < num_blocks; i++) {
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
    aes_nohw_compact_block(block, in + 16 * i);
    aes_nohw_batch_set(out, block, i);
  }

  aes_nohw_transpose(out);
}

// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
                                const AES_NOHW_BATCH *batch) {
  AES_NOHW_BATCH copy = *batch;
  aes_nohw_transpose(&copy);

  debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
  for (size_t i = 0; i < num_blocks; i++) {
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
    aes_nohw_batch_get(&copy, block, i);
    aes_nohw_uncompact_block(out + 16 * i, block);
  }
}


// AES round steps.

static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
                                   const AES_NOHW_BATCH *key) {
  for (size_t i = 0; i < 8; i++) {
    batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
  }
}

static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
  // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
  aes_word_t x0 = batch->w[7];
  aes_word_t x1 = batch->w[6];
  aes_word_t x2 = batch->w[5];
  aes_word_t x3 = batch->w[4];
  aes_word_t x4 = batch->w[3];
  aes_word_t x5 = batch->w[2];
  aes_word_t x6 = batch->w[1];
  aes_word_t x7 = batch->w[0];

  // Figure 2, the top linear transformation.
  aes_word_t y14 = aes_nohw_xor(x3, x5);
  aes_word_t y13 = aes_nohw_xor(x0, x6);
  aes_word_t y9 = aes_nohw_xor(x0, x3);
  aes_word_t y8 = aes_nohw_xor(x0, x5);
  aes_word_t t0 = aes_nohw_xor(x1, x2);
  aes_word_t y1 = aes_nohw_xor(t0, x7);
  aes_word_t y4 = aes_nohw_xor(y1, x3);
  aes_word_t y12 = aes_nohw_xor(y13, y14);
  aes_word_t y2 = aes_nohw_xor(y1, x0);
  aes_word_t y5 = aes_nohw_xor(y1, x6);
  aes_word_t y3 = aes_nohw_xor(y5, y8);
  aes_word_t t1 = aes_nohw_xor(x4, y12);
  aes_word_t y15 = aes_nohw_xor(t1, x5);
  aes_word_t y20 = aes_nohw_xor(t1, x1);
  aes_word_t y6 = aes_nohw_xor(y15, x7);
  aes_word_t y10 = aes_nohw_xor(y15, t0);
  aes_word_t y11 = aes_nohw_xor(y20, y9);
  aes_word_t y7 = aes_nohw_xor(x7, y11);
  aes_word_t y17 = aes_nohw_xor(y10, y11);
  aes_word_t y19 = aes_nohw_xor(y10, y8);
  aes_word_t y16 = aes_nohw_xor(t0, y11);
  aes_word_t y21 = aes_nohw_xor(y13, y16);
  aes_word_t y18 = aes_nohw_xor(x0, y16);

  // Figure 3, the middle non-linear section.
  aes_word_t t2 = aes_nohw_and(y12, y15);
  aes_word_t t3 = aes_nohw_and(y3, y6);
  aes_word_t t4 = aes_nohw_xor(t3, t2);
  aes_word_t t5 = aes_nohw_and(y4, x7);
  aes_word_t t6 = aes_nohw_xor(t5, t2);
  aes_word_t t7 = aes_nohw_and(y13, y16);
  aes_word_t t8 = aes_nohw_and(y5, y1);
  aes_word_t t9 = aes_nohw_xor(t8, t7);
  aes_word_t t10 = aes_nohw_and(y2, y7);
  aes_word_t t11 = aes_nohw_xor(t10, t7);
  aes_word_t t12 = aes_nohw_and(y9, y11);
  aes_word_t t13 = aes_nohw_and(y14, y17);
  aes_word_t t14 = aes_nohw_xor(t13, t12);
  aes_word_t t15 = aes_nohw_and(y8, y10);
  aes_word_t t16 = aes_nohw_xor(t15, t12);
  aes_word_t t17 = aes_nohw_xor(t4, t14);
  aes_word_t t18 = aes_nohw_xor(t6, t16);
  aes_word_t t19 = aes_nohw_xor(t9, t14);
  aes_word_t t20 = aes_nohw_xor(t11, t16);
  aes_word_t t21 = aes_nohw_xor(t17, y20);
  aes_word_t t22 = aes_nohw_xor(t18, y19);
  aes_word_t t23 = aes_nohw_xor(t19, y21);
  aes_word_t t24 = aes_nohw_xor(t20, y18);
  aes_word_t t25 = aes_nohw_xor(t21, t22);
  aes_word_t t26 = aes_nohw_and(t21, t23);
  aes_word_t t27 = aes_nohw_xor(t24, t26);
  aes_word_t t28 = aes_nohw_and(t25, t27);
  aes_word_t t29 = aes_nohw_xor(t28, t22);
  aes_word_t t30 = aes_nohw_xor(t23, t24);
  aes_word_t t31 = aes_nohw_xor(t22, t26);
  aes_word_t t32 = aes_nohw_and(t31, t30);
  aes_word_t t33 = aes_nohw_xor(t32, t24);
  aes_word_t t34 = aes_nohw_xor(t23, t33);
  aes_word_t t35 = aes_nohw_xor(t27, t33);
  aes_word_t t36 = aes_nohw_and(t24, t35);
  aes_word_t t37 = aes_nohw_xor(t36, t34);
  aes_word_t t38 = aes_nohw_xor(t27, t36);
  aes_word_t t39 = aes_nohw_and(t29, t38);
  aes_word_t t40 = aes_nohw_xor(t25, t39);
  aes_word_t t41 = aes_nohw_xor(t40, t37);
  aes_word_t t42 = aes_nohw_xor(t29, t33);
  aes_word_t t43 = aes_nohw_xor(t29, t40);
  aes_word_t t44 = aes_nohw_xor(t33, t37);
  aes_word_t t45 = aes_nohw_xor(t42, t41);
  aes_word_t z0 = aes_nohw_and(t44, y15);
  aes_word_t z1 = aes_nohw_and(t37, y6);
  aes_word_t z2 = aes_nohw_and(t33, x7);
  aes_word_t z3 = aes_nohw_and(t43, y16);
  aes_word_t z4 = aes_nohw_and(t40, y1);
  aes_word_t z5 = aes_nohw_and(t29, y7);
  aes_word_t z6 = aes_nohw_and(t42, y11);
  aes_word_t z7 = aes_nohw_and(t45, y17);
  aes_word_t z8 = aes_nohw_and(t41, y10);
  aes_word_t z9 = aes_nohw_and(t44, y12);
  aes_word_t z10 = aes_nohw_and(t37, y3);
  aes_word_t z11 = aes_nohw_and(t33, y4);
  aes_word_t z12 = aes_nohw_and(t43, y13);
  aes_word_t z13 = aes_nohw_and(t40, y5);
  aes_word_t z14 = aes_nohw_and(t29, y2);
  aes_word_t z15 = aes_nohw_and(t42, y9);
  aes_word_t z16 = aes_nohw_and(t45, y14);
  aes_word_t z17 = aes_nohw_and(t41, y8);

  // Figure 4, bottom linear transformation.
  aes_word_t t46 = aes_nohw_xor(z15, z16);
  aes_word_t t47 = aes_nohw_xor(z10, z11);
  aes_word_t t48 = aes_nohw_xor(z5, z13);
  aes_word_t t49 = aes_nohw_xor(z9, z10);
  aes_word_t t50 = aes_nohw_xor(z2, z12);
  aes_word_t t51 = aes_nohw_xor(z2, z5);
  aes_word_t t52 = aes_nohw_xor(z7, z8);
  aes_word_t t53 = aes_nohw_xor(z0, z3);
  aes_word_t t54 = aes_nohw_xor(z6, z7);
  aes_word_t t55 = aes_nohw_xor(z16, z17);
  aes_word_t t56 = aes_nohw_xor(z12, t48);
  aes_word_t t57 = aes_nohw_xor(t50, t53);
  aes_word_t t58 = aes_nohw_xor(z4, t46);
  aes_word_t t59 = aes_nohw_xor(z3, t54);
  aes_word_t t60 = aes_nohw_xor(t46, t57);
  aes_word_t t61 = aes_nohw_xor(z14, t57);
  aes_word_t t62 = aes_nohw_xor(t52, t58);
  aes_word_t t63 = aes_nohw_xor(t49, t58);
  aes_word_t t64 = aes_nohw_xor(z4, t59);
  aes_word_t t65 = aes_nohw_xor(t61, t62);
  aes_word_t t66 = aes_nohw_xor(z1, t63);
  aes_word_t s0 = aes_nohw_xor(t59, t63);
  aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
  aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
  aes_word_t t67 = aes_nohw_xor(t64, t65);
  aes_word_t s3 = aes_nohw_xor(t53, t66);
  aes_word_t s4 = aes_nohw_xor(t51, t66);
  aes_word_t s5 = aes_nohw_xor(t47, t65);
  aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
  aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));

  batch->w[0] = s7;
  batch->w[1] = s6;
  batch->w[2] = s5;
  batch->w[3] = s4;
  batch->w[4] = s3;
  batch->w[5] = s2;
  batch->w[6] = s1;
  batch->w[7] = s0;
}

// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
// constant shift counts in the SSE2 implementation.
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
  (aes_nohw_or(aes_nohw_shift_right((v), (n)*4),                      \
               aes_nohw_shift_left((v), 16 - (n)*4)))

static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
  for (size_t i = 0; i < 8; i++) {
    aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
    aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
    aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
    aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
    row1 = aes_nohw_rotate_cols_right(row1, 1);
    row2 = aes_nohw_rotate_cols_right(row2, 2);
    row3 = aes_nohw_rotate_cols_right(row3, 3);
    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
  }
}

// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
// down by one.
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
#if defined(OPENSSL_SSE2)
  return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
#elif defined(OPENSSL_64_BIT)
  return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
         ((v << 12) & UINT64_C(0xf000f000f000f000));
#else
  return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
#endif
}

// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
// by two.
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
#if defined(OPENSSL_SSE2)
  return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
#elif defined(OPENSSL_64_BIT)
  return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
         ((v << 8) & UINT64_C(0xff00ff00ff00ff00));
#else
  return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
#endif
}

static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
  // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
  aes_word_t a0 = batch->w[0];
  aes_word_t a1 = batch->w[1];
  aes_word_t a2 = batch->w[2];
  aes_word_t a3 = batch->w[3];
  aes_word_t a4 = batch->w[4];
  aes_word_t a5 = batch->w[5];
  aes_word_t a6 = batch->w[6];
  aes_word_t a7 = batch->w[7];

  aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
  aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
  aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
  aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
  aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
  aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
  aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
  aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
  aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
  aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
  aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
  aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
  aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
  aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
  aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
  aes_word_t a7_r7 = aes_nohw_xor(a7, r7);

  batch->w[0] =
      aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
  batch->w[1] =
      aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
                   aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
  batch->w[2] =
      aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
  batch->w[3] =
      aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
                   aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
  batch->w[4] =
      aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
                   aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
  batch->w[5] =
      aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
  batch->w[6] =
      aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
  batch->w[7] =
      aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
}

static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
  aes_nohw_add_round_key(batch, &key->keys[0]);
  for (size_t i = 1; i < num_rounds; i++) {
    aes_nohw_sub_bytes(batch);
    aes_nohw_shift_rows(batch);
    aes_nohw_mix_columns(batch);
    aes_nohw_add_round_key(batch, &key->keys[i]);
  }
  aes_nohw_sub_bytes(batch);
  aes_nohw_shift_rows(batch);
  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
}

// Key schedule.

static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
                                       const AES_KEY *key) {
  for (unsigned i = 0; i <= key->rounds; i++) {
    // Copy the round key into each block in the batch.
    for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
      aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
      GFp_memcpy(tmp, key->rd_key + 4 * i, 16);
      aes_nohw_batch_set(&out->keys[i], tmp, j);
    }
    aes_nohw_transpose(&out->keys[i]);
  }
}

static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
                                          0x20, 0x40, 0x80, 0x1b, 0x36};

// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
// |rcon|, stored in an |aes_word_t|.
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
  rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
#if defined(OPENSSL_SSE2)
  return _mm_set_epi32(0, 0, 0, rcon);
#else
  return ((aes_word_t)rcon);
#endif
}

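// Illustrative note (editor's addition, not part of the upstream source): with
// 64-bit words, AES_NOHW_BATCH_SIZE is 4, so aes_nohw_rcon_slice(0x1b, 0)
// extracts the low nibble 0xb and aes_nohw_rcon_slice(0x1b, 1) extracts the
// high nibble 0x1, one slice per word of a compact block.
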
static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                               const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
  AES_NOHW_BATCH batch;
  GFp_memset(&batch, 0, sizeof(batch));
  aes_nohw_batch_set(&batch, in, 0);
  aes_nohw_transpose(&batch);
  aes_nohw_sub_bytes(&batch);
  aes_nohw_transpose(&batch);
  aes_nohw_batch_get(&batch, out, 0);
}

static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
  key->rounds = 10;

  aes_word_t block[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block, in);
  GFp_memcpy(key->rd_key, block, 16);

  for (size_t i = 1; i <= 10; i++) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block);
    uint8_t rcon = aes_nohw_rcon[i - 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate |rcon| and the transformed word into the first word.
      block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
      block[j] = aes_nohw_xor(
          block[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words. Note this is reordered from the usual
      // formulation to avoid needing masks.
      aes_word_t v = block[j];
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
    }
    GFp_memcpy(key->rd_key + 4 * i, block, 16);
  }
}

static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
  key->rounds = 14;

  // Each key schedule iteration produces two round keys.
  aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block1, in);
  GFp_memcpy(key->rd_key, block1, 16);

  aes_nohw_compact_block(block2, in + 16);
  GFp_memcpy(key->rd_key + 4, block2, 16);

  for (size_t i = 2; i <= 14; i += 2) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block2);
    uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate |rcon| and the transformed word into the first word.
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
      block1[j] = aes_nohw_xor(
          block1[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words.
      aes_word_t v = block1[j];
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
    }
    GFp_memcpy(key->rd_key + 4 * i, block1, 16);

    if (i == 14) {
      break;
    }

    aes_nohw_sub_block(sub, block1);
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate the transformed word into the first word.
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
      // Propagate to the remaining words.
      aes_word_t v = block2[j];
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
    }
    GFp_memcpy(key->rd_key + 4 * (i + 1), block2, 16);
  }
}


// External API.

int GFp_aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
                                 AES_KEY *aeskey) {
  switch (bits) {
    case 128:
      aes_nohw_setup_key_128(aeskey, key);
      return 0;
    case 256:
      aes_nohw_setup_key_256(aeskey, key);
      return 0;
  }
  return 1;
}

void GFp_aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);
  AES_NOHW_BATCH batch;
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
  aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}

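// Usage sketch (editor's addition, compiled out, not part of the upstream
// source): a caller sets up an AES_KEY with GFp_aes_nohw_set_encrypt_key
// (which returns 0 on success) and then encrypts single 16-byte blocks with
// GFp_aes_nohw_encrypt. The all-zero key below is an arbitrary example.
#if 0
static void aes_nohw_usage_example(void) {
  static const uint8_t key_bytes[16] = {0};  // example 128-bit key
  uint8_t plaintext[16] = {0};
  uint8_t ciphertext[16];
  AES_KEY key;
  if (GFp_aes_nohw_set_encrypt_key(key_bytes, 128, &key) != 0) {
    return;  // unsupported key size
  }
  GFp_aes_nohw_encrypt(plaintext, ciphertext, &key);
}
#endif
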
static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
                                      const uint8_t b[16]) {
  for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
    aes_word_t x, y;
    GFp_memcpy(&x, a + i, sizeof(aes_word_t));
    GFp_memcpy(&y, b + i, sizeof(aes_word_t));
    x = aes_nohw_xor(x, y);
    GFp_memcpy(out + i, &x, sizeof(aes_word_t));
  }
}

void GFp_aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                       size_t blocks, const AES_KEY *key,
                                       const uint8_t ivec[16]) {
  if (blocks == 0) {
    return;
  }

  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);

  // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
  alignas(AES_NOHW_WORD_SIZE) union {
    uint32_t u32[AES_NOHW_BATCH_SIZE * 4];
    uint8_t u8[AES_NOHW_BATCH_SIZE * 16];
  } ivs, enc_ivs;
  for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
    GFp_memcpy(ivs.u8 + 16 * i, ivec, 16);
  }

  uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]);
  for (;;) {
    // Update counters.
    for (uint32_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
      ivs.u32[4 * i + 3] = CRYPTO_bswap4(ctr + i);
    }

    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
    AES_NOHW_BATCH batch;
    aes_nohw_to_batch(&batch, ivs.u8, todo);
    aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
    aes_nohw_from_batch(enc_ivs.u8, todo, &batch);

    for (size_t i = 0; i < todo; i++) {
      aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs.u8 + 16 * i);
    }

    blocks -= todo;
    if (blocks == 0) {
      break;
    }

    in += 16 * AES_NOHW_BATCH_SIZE;
    out += 16 * AES_NOHW_BATCH_SIZE;
    ctr += AES_NOHW_BATCH_SIZE;
  }
}
zeroidc/vendor/ring/crypto/fipsmodule/aes/asm/aesni-x86.pl (vendored, new file, 971 lines)
@@ -0,0 +1,971 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# This module implements support for Intel AES-NI extension. In
|
||||
# OpenSSL context it's used with Intel engine, but can also be used as
|
||||
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
|
||||
# details].
|
||||
#
|
||||
# Performance.
|
||||
#
|
||||
# To start with see corresponding paragraph in aesni-x86_64.pl...
|
||||
# Instead of filling table similar to one found there I've chosen to
|
||||
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
|
||||
# The simplified table below represents 32-bit performance relative
|
||||
# to 64-bit one in every given point. Ratios vary for different
|
||||
# encryption modes, therefore interval values.
|
||||
#
|
||||
# 16-byte 64-byte 256-byte 1-KB 8-KB
|
||||
# 53-67% 67-84% 91-94% 95-98% 97-99.5%
|
||||
#
|
||||
# Lower ratios for smaller block sizes are perfectly understandable,
|
||||
# because function call overhead is higher in 32-bit mode. Largest
|
||||
# 8-KB block performance is virtually same: 32-bit code is less than
|
||||
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
|
||||
|
||||
# January 2011
|
||||
#
|
||||
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
|
||||
# interleaves at most 6 aes[enc|dec] instructions, because there are
|
||||
# not enough registers for 8x interleave [which should be optimal for
|
||||
# Sandy Bridge]. Actually, performance results for 6x interleave
|
||||
# factor presented in aesni-x86_64.pl (except for CTR) are for this
|
||||
# module.
|
||||
|
||||
# April 2011
|
||||
#
|
||||
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
|
||||
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
|
||||
|
||||
# November 2015
|
||||
#
|
||||
# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]
|
||||
|
||||
######################################################################
|
||||
# Current large-block performance in cycles per byte processed with
|
||||
# 128-bit key (less is better).
|
||||
#
|
||||
# CBC en-/decrypt CTR XTS ECB OCB
|
||||
# Westmere 3.77/1.37 1.37 1.52 1.27
|
||||
# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
|
||||
# Haswell 4.44/0.80 0.97 1.03 0.72 0.76
|
||||
# Skylake 2.68/0.65 0.65 0.66 0.64 0.66
|
||||
# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
|
||||
# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70
|
||||
# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
|
||||
|
||||
$PREFIX="GFp_aes_hw"; # if $PREFIX is set to "AES", the script
|
||||
# generates drop-in replacement for
|
||||
# crypto/aes/asm/aes-586.pl:-)
|
||||
$AESNI_PREFIX="GFp_aes_hw";
|
||||
$inline=1; # inline _aesni_[en|de]crypt
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
$output = pop;
|
||||
open OUT,">$output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
&asm_init($ARGV[0]);
|
||||
|
||||
&external_label("GFp_ia32cap_P");
|
||||
&static_label("key_const");
|
||||
|
||||
if ($PREFIX eq $AESNI_PREFIX) { $movekey=\&movups; }
|
||||
else { $movekey=\&movups; }
|
||||
|
||||
$len="eax";
|
||||
$rounds="ecx";
|
||||
$key="edx";
|
||||
$inp="esi";
|
||||
$out="edi";
|
||||
$rounds_="ebx"; # backup copy for $rounds
|
||||
$key_="ebp"; # backup copy for $key
|
||||
|
||||
$rndkey0="xmm0";
|
||||
$rndkey1="xmm1";
|
||||
$inout0="xmm2";
|
||||
$inout1="xmm3";
|
||||
$inout2="xmm4";
|
||||
$inout3="xmm5"; $in1="xmm5";
|
||||
$inout4="xmm6"; $in0="xmm6";
|
||||
$inout5="xmm7"; $ivec="xmm7";
|
||||
|
||||
# AESNI extension
|
||||
sub aeskeygenassist
|
||||
{ my($dst,$src,$imm)=@_;
|
||||
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
|
||||
{ &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
|
||||
}
|
||||
sub aescommon
|
||||
{ my($opcodelet,$dst,$src)=@_;
|
||||
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
|
||||
{ &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
|
||||
}
|
||||
sub aesimc { aescommon(0xdb,@_); }
|
||||
sub aesenc { aescommon(0xdc,@_); }
|
||||
sub aesenclast { aescommon(0xdd,@_); }
|
||||
|
||||
# Inline version of internal aesni_[en|de]crypt1
|
||||
{ my $sn;
|
||||
sub aesni_inline_generate1
|
||||
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
|
||||
$sn++;
|
||||
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&$movekey ($rndkey1,&QWP(16,$key));
|
||||
&xorps ($ivec,$rndkey0) if (defined($ivec));
|
||||
&lea ($key,&DWP(32,$key));
|
||||
&xorps ($inout,$ivec) if (defined($ivec));
|
||||
&xorps ($inout,$rndkey0) if (!defined($ivec));
|
||||
&set_label("${p}1_loop_$sn");
|
||||
eval"&aes${p} ($inout,$rndkey1)";
|
||||
&dec ($rounds);
|
||||
&$movekey ($rndkey1,&QWP(0,$key));
|
||||
&lea ($key,&DWP(16,$key));
|
||||
&jnz (&label("${p}1_loop_$sn"));
|
||||
eval"&aes${p}last ($inout,$rndkey1)";
|
||||
}}
|
||||
|
||||
sub aesni_generate1 # fully unrolled loop
|
||||
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
|
||||
|
||||
&function_begin_B("_aesni_${p}rypt1");
|
||||
&movups ($rndkey0,&QWP(0,$key));
|
||||
&$movekey ($rndkey1,&QWP(0x10,$key));
|
||||
&xorps ($inout,$rndkey0);
|
||||
&$movekey ($rndkey0,&QWP(0x20,$key));
|
||||
&lea ($key,&DWP(0x30,$key));
|
||||
&cmp ($rounds,11);
|
||||
&jb (&label("${p}128"));
|
||||
&lea ($key,&DWP(0x40,$key));
|
||||
# 192-bit key support was removed.
|
||||
|
||||
eval"&aes${p} ($inout,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(-0x40,$key));
|
||||
eval"&aes${p} ($inout,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(-0x30,$key));
|
||||
|
||||
# 192-bit key support was removed.
|
||||
eval"&aes${p} ($inout,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(-0x20,$key));
|
||||
eval"&aes${p} ($inout,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(-0x10,$key));
|
||||
&set_label("${p}128");
|
||||
eval"&aes${p} ($inout,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(0,$key));
|
||||
eval"&aes${p} ($inout,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(0x10,$key));
|
||||
eval"&aes${p} ($inout,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(0x20,$key));
|
||||
eval"&aes${p} ($inout,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(0x30,$key));
|
||||
eval"&aes${p} ($inout,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(0x40,$key));
|
||||
eval"&aes${p} ($inout,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(0x50,$key));
|
||||
eval"&aes${p} ($inout,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(0x60,$key));
|
||||
eval"&aes${p} ($inout,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(0x70,$key));
|
||||
eval"&aes${p} ($inout,$rndkey1)";
|
||||
eval"&aes${p}last ($inout,$rndkey0)";
|
||||
&ret();
|
||||
&function_end_B("_aesni_${p}rypt1");
|
||||
}
|
||||
|
||||
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
|
||||
&aesni_generate1("enc") if (!$inline);
|
||||
&function_begin_B("${PREFIX}_encrypt");
|
||||
&mov ("eax",&wparam(0));
|
||||
&mov ($key,&wparam(2));
|
||||
&movups ($inout0,&QWP(0,"eax"));
|
||||
&mov ($rounds,&DWP(240,$key));
|
||||
&mov ("eax",&wparam(1));
|
||||
if ($inline)
|
||||
{ &aesni_inline_generate1("enc"); }
|
||||
else
|
||||
{ &call ("_aesni_encrypt1"); }
|
||||
&pxor ($rndkey0,$rndkey0); # clear register bank
|
||||
&pxor ($rndkey1,$rndkey1);
|
||||
&movups (&QWP(0,"eax"),$inout0);
|
||||
&pxor ($inout0,$inout0);
|
||||
&ret ();
|
||||
&function_end_B("${PREFIX}_encrypt");
|
||||
|
||||
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
|
||||
# factor. Why 3x subroutine were originally used in loops? Even though
|
||||
# aes[enc|dec] latency was originally 6, it could be scheduled only
|
||||
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
|
||||
# utilization, i.e. when subroutine's throughput is virtually same as
|
||||
# of non-interleaved subroutine [for number of input blocks up to 3].
|
||||
# This is why it originally made no sense to implement 2x subroutine.
|
||||
# But times change and it became appropriate to spend extra 192 bytes
|
||||
# on 2x subroutine on Atom Silvermont account. For processors that
|
||||
# can schedule aes[enc|dec] every cycle optimal interleave factor
|
||||
# equals to corresponding instructions latency. 8x is optimal for
|
||||
# * Bridge, but it's unfeasible to accommodate such implementation
|
||||
# in XMM registers addressable in 32-bit mode and therefore maximum
|
||||
# of 6x is used instead...
|
||||
|
||||
sub aesni_generate2
|
||||
{ my $p=shift;
|
||||
|
||||
&function_begin_B("_aesni_${p}rypt2");
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&shl ($rounds,4);
|
||||
&$movekey ($rndkey1,&QWP(16,$key));
|
||||
&xorps ($inout0,$rndkey0);
|
||||
&pxor ($inout1,$rndkey0);
|
||||
&$movekey ($rndkey0,&QWP(32,$key));
|
||||
&lea ($key,&DWP(32,$key,$rounds));
|
||||
&neg ($rounds);
|
||||
&add ($rounds,16);
|
||||
|
||||
&set_label("${p}2_loop");
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
|
||||
&add ($rounds,32);
|
||||
eval"&aes${p} ($inout0,$rndkey0)";
|
||||
eval"&aes${p} ($inout1,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
|
||||
&jnz (&label("${p}2_loop"));
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
eval"&aes${p}last ($inout0,$rndkey0)";
|
||||
eval"&aes${p}last ($inout1,$rndkey0)";
|
||||
&ret();
|
||||
&function_end_B("_aesni_${p}rypt2");
|
||||
}
|
||||
|
||||
sub aesni_generate3
|
||||
{ my $p=shift;
|
||||
|
||||
&function_begin_B("_aesni_${p}rypt3");
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&shl ($rounds,4);
|
||||
&$movekey ($rndkey1,&QWP(16,$key));
|
||||
&xorps ($inout0,$rndkey0);
|
||||
&pxor ($inout1,$rndkey0);
|
||||
&pxor ($inout2,$rndkey0);
|
||||
&$movekey ($rndkey0,&QWP(32,$key));
|
||||
&lea ($key,&DWP(32,$key,$rounds));
|
||||
&neg ($rounds);
|
||||
&add ($rounds,16);
|
||||
|
||||
&set_label("${p}3_loop");
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
eval"&aes${p} ($inout2,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
|
||||
&add ($rounds,32);
|
||||
eval"&aes${p} ($inout0,$rndkey0)";
|
||||
eval"&aes${p} ($inout1,$rndkey0)";
|
||||
eval"&aes${p} ($inout2,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
|
||||
&jnz (&label("${p}3_loop"));
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
eval"&aes${p} ($inout2,$rndkey1)";
|
||||
eval"&aes${p}last ($inout0,$rndkey0)";
|
||||
eval"&aes${p}last ($inout1,$rndkey0)";
|
||||
eval"&aes${p}last ($inout2,$rndkey0)";
|
||||
&ret();
|
||||
&function_end_B("_aesni_${p}rypt3");
|
||||
}
|
||||
|
||||
# 4x interleave is implemented to improve small block performance,
|
||||
# most notably [and naturally] 4 block by ~30%. One can argue that one
|
||||
# should have implemented 5x as well, but improvement would be <20%,
|
||||
# so it's not worth it...
|
||||
sub aesni_generate4
|
||||
{ my $p=shift;
|
||||
|
||||
&function_begin_B("_aesni_${p}rypt4");
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&$movekey ($rndkey1,&QWP(16,$key));
|
||||
&shl ($rounds,4);
|
||||
&xorps ($inout0,$rndkey0);
|
||||
&pxor ($inout1,$rndkey0);
|
||||
&pxor ($inout2,$rndkey0);
|
||||
&pxor ($inout3,$rndkey0);
|
||||
&$movekey ($rndkey0,&QWP(32,$key));
|
||||
&lea ($key,&DWP(32,$key,$rounds));
|
||||
&neg ($rounds);
|
||||
&data_byte (0x0f,0x1f,0x40,0x00);
|
||||
&add ($rounds,16);
|
||||
|
||||
&set_label("${p}4_loop");
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
eval"&aes${p} ($inout2,$rndkey1)";
|
||||
eval"&aes${p} ($inout3,$rndkey1)";
|
||||
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
|
||||
&add ($rounds,32);
|
||||
eval"&aes${p} ($inout0,$rndkey0)";
|
||||
eval"&aes${p} ($inout1,$rndkey0)";
|
||||
eval"&aes${p} ($inout2,$rndkey0)";
|
||||
eval"&aes${p} ($inout3,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
|
||||
&jnz (&label("${p}4_loop"));
|
||||
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
eval"&aes${p} ($inout2,$rndkey1)";
|
||||
eval"&aes${p} ($inout3,$rndkey1)";
|
||||
eval"&aes${p}last ($inout0,$rndkey0)";
|
||||
eval"&aes${p}last ($inout1,$rndkey0)";
|
||||
eval"&aes${p}last ($inout2,$rndkey0)";
|
||||
eval"&aes${p}last ($inout3,$rndkey0)";
|
||||
&ret();
|
||||
&function_end_B("_aesni_${p}rypt4");
|
||||
}
|
||||
|
||||
sub aesni_generate6
|
||||
{ my $p=shift;
|
||||
|
||||
&function_begin_B("_aesni_${p}rypt6");
|
||||
&static_label("_aesni_${p}rypt6_enter");
|
||||
&$movekey ($rndkey0,&QWP(0,$key));
|
||||
&shl ($rounds,4);
|
||||
&$movekey ($rndkey1,&QWP(16,$key));
|
||||
&xorps ($inout0,$rndkey0);
|
||||
&pxor ($inout1,$rndkey0); # pxor does better here
|
||||
&pxor ($inout2,$rndkey0);
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
&pxor ($inout3,$rndkey0);
|
||||
&pxor ($inout4,$rndkey0);
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
&lea ($key,&DWP(32,$key,$rounds));
|
||||
&neg ($rounds);
|
||||
eval"&aes${p} ($inout2,$rndkey1)";
|
||||
&pxor ($inout5,$rndkey0);
|
||||
&$movekey ($rndkey0,&QWP(0,$key,$rounds));
|
||||
&add ($rounds,16);
|
||||
&jmp (&label("_aesni_${p}rypt6_inner"));
|
||||
|
||||
&set_label("${p}6_loop",16);
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
eval"&aes${p} ($inout2,$rndkey1)";
|
||||
&set_label("_aesni_${p}rypt6_inner");
|
||||
eval"&aes${p} ($inout3,$rndkey1)";
|
||||
eval"&aes${p} ($inout4,$rndkey1)";
|
||||
eval"&aes${p} ($inout5,$rndkey1)";
|
||||
&set_label("_aesni_${p}rypt6_enter");
|
||||
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
|
||||
&add ($rounds,32);
|
||||
eval"&aes${p} ($inout0,$rndkey0)";
|
||||
eval"&aes${p} ($inout1,$rndkey0)";
|
||||
eval"&aes${p} ($inout2,$rndkey0)";
|
||||
eval"&aes${p} ($inout3,$rndkey0)";
|
||||
eval"&aes${p} ($inout4,$rndkey0)";
|
||||
eval"&aes${p} ($inout5,$rndkey0)";
|
||||
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
|
||||
&jnz (&label("${p}6_loop"));
|
||||
|
||||
eval"&aes${p} ($inout0,$rndkey1)";
|
||||
eval"&aes${p} ($inout1,$rndkey1)";
|
||||
eval"&aes${p} ($inout2,$rndkey1)";
|
||||
eval"&aes${p} ($inout3,$rndkey1)";
|
||||
eval"&aes${p} ($inout4,$rndkey1)";
|
||||
eval"&aes${p} ($inout5,$rndkey1)";
|
||||
eval"&aes${p}last ($inout0,$rndkey0)";
|
||||
eval"&aes${p}last ($inout1,$rndkey0)";
|
||||
eval"&aes${p}last ($inout2,$rndkey0)";
|
||||
eval"&aes${p}last ($inout3,$rndkey0)";
|
||||
eval"&aes${p}last ($inout4,$rndkey0)";
|
||||
eval"&aes${p}last ($inout5,$rndkey0)";
|
||||
&ret();
|
||||
&function_end_B("_aesni_${p}rypt6");
|
||||
}
|
||||
&aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX);
|
||||
&aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX);
|
||||
&aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX);
|
||||
&aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX);
|
||||
|
||||
if ($PREFIX eq $AESNI_PREFIX) {
|
||||
|
||||
######################################################################
|
||||
# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
|
||||
# size_t blocks, const AES_KEY *key,
|
||||
# const char *ivec);
|
||||
#
|
||||
# Handles only complete blocks, operates on 32-bit counter and
|
||||
# does not update *ivec! (see crypto/modes/ctr128.c for details)
|
||||
#
|
||||
# stack layout:
|
||||
# 0 pshufb mask
|
||||
# 16 vector addend: 0,6,6,6
|
||||
# 32 counter-less ivec
|
||||
# 48 1st triplet of counter vector
|
||||
# 64 2nd triplet of counter vector
|
||||
# 80 saved %esp
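#
# The addend of 6 matches the 6-block interleave of the main loop below:
# the two saved triplets hold the big-endian counters for blocks n+0..n+2
# and n+3..n+5 and are bumped with paddd each iteration, while the
# counter-less ivec (offset 32) is pre-xored with round key 0 so it can be
# merged into each block with a single pxor.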
|
||||
|
||||
&function_begin("${PREFIX}_ctr32_encrypt_blocks");
|
||||
&mov ($inp,&wparam(0));
|
||||
&mov ($out,&wparam(1));
|
||||
&mov ($len,&wparam(2));
|
||||
&mov ($key,&wparam(3));
|
||||
&mov ($rounds_,&wparam(4));
|
||||
&mov ($key_,"esp");
|
||||
&sub ("esp",88);
|
||||
&and ("esp",-16); # align stack
|
||||
&mov (&DWP(80,"esp"),$key_);
|
||||
|
||||
&cmp ($len,1);
|
||||
&je (&label("ctr32_one_shortcut"));
|
||||
|
||||
&movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
|
||||
|
||||
# compose byte-swap control mask for pshufb on stack
|
||||
&mov (&DWP(0,"esp"),0x0c0d0e0f);
|
||||
&mov (&DWP(4,"esp"),0x08090a0b);
|
||||
&mov (&DWP(8,"esp"),0x04050607);
|
||||
&mov (&DWP(12,"esp"),0x00010203);
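	# taken together these four dwords form the pshufb index vector
	# 15,14,...,1,0, i.e. a full 16-byte reversal, used below to flip the
	# natively-incremented counters back into the big-endian byte order
	# of the IV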
|
||||
|
||||
# compose counter increment vector on stack
|
||||
&mov ($rounds,6);
|
||||
&xor ($key_,$key_);
|
||||
&mov (&DWP(16,"esp"),$rounds);
|
||||
&mov (&DWP(20,"esp"),$rounds);
|
||||
&mov (&DWP(24,"esp"),$rounds);
|
||||
&mov (&DWP(28,"esp"),$key_);
|
||||
|
||||
&pextrd ($rounds_,$inout5,3); # pull 32-bit counter
|
||||
&pinsrd ($inout5,$key_,3); # wipe 32-bit counter
|
||||
|
||||
&mov ($rounds,&DWP(240,$key)); # key->rounds
|
||||
|
||||
# compose 2 vectors of 3x32-bit counters
|
||||
&bswap ($rounds_);
|
||||
&pxor ($rndkey0,$rndkey0);
|
||||
&pxor ($rndkey1,$rndkey1);
|
||||
&movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
|
||||
&pinsrd ($rndkey0,$rounds_,0);
|
||||
&lea ($key_,&DWP(3,$rounds_));
|
||||
&pinsrd ($rndkey1,$key_,0);
|
||||
&inc ($rounds_);
|
||||
&pinsrd ($rndkey0,$rounds_,1);
|
||||
&inc ($key_);
|
||||
&pinsrd ($rndkey1,$key_,1);
|
||||
&inc ($rounds_);
|
||||
&pinsrd ($rndkey0,$rounds_,2);
|
||||
&inc ($key_);
|
||||
&pinsrd ($rndkey1,$key_,2);
|
||||
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
|
||||
&pshufb ($rndkey0,$inout0); # byte swap
|
||||
&movdqu ($inout4,&QWP(0,$key)); # key[0]
|
||||
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
|
||||
&pshufb ($rndkey1,$inout0); # byte swap
|
||||
|
||||
&pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
|
||||
&pshufd ($inout1,$rndkey0,2<<6);
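	# pshufd with 3<<6/2<<6/1<<6 copies dword 0 (zero) into lanes 0-2 and
	# the selected big-endian counter into lane 3, so merging with the
	# counter-less ivec later (pxor in the main loop, por in the tail)
	# reconstructs a complete counter block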
|
||||
&cmp ($len,6);
|
||||
&jb (&label("ctr32_tail"));
|
||||
&pxor ($inout5,$inout4); # counter-less ivec^key[0]
|
||||
&shl ($rounds,4);
|
||||
&mov ($rounds_,16);
|
||||
&movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
|
||||
&mov ($key_,$key); # backup $key
|
||||
&sub ($rounds_,$rounds); # backup twisted $rounds
|
||||
&lea ($key,&DWP(32,$key,$rounds));
|
||||
&sub ($len,6);
|
||||
&jmp (&label("ctr32_loop6"));
|
||||
|
||||
&set_label("ctr32_loop6",16);
|
||||
# inlining _aesni_encrypt6's prologue gives ~6% improvement...
|
||||
&pshufd ($inout2,$rndkey0,1<<6);
|
||||
&movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
|
||||
&pshufd ($inout3,$rndkey1,3<<6);
|
||||
&pxor ($inout0,$rndkey0); # merge counter-less ivec
|
||||
&pshufd ($inout4,$rndkey1,2<<6);
|
||||
&pxor ($inout1,$rndkey0);
|
||||
&pshufd ($inout5,$rndkey1,1<<6);
|
||||
&$movekey ($rndkey1,&QWP(16,$key_));
|
||||
&pxor ($inout2,$rndkey0);
|
||||
&pxor ($inout3,$rndkey0);
|
||||
&aesenc ($inout0,$rndkey1);
|
||||
&pxor ($inout4,$rndkey0);
|
||||
&pxor ($inout5,$rndkey0);
|
||||
&aesenc ($inout1,$rndkey1);
|
||||
&$movekey ($rndkey0,&QWP(32,$key_));
|
||||
&mov ($rounds,$rounds_);
|
||||
&aesenc ($inout2,$rndkey1);
|
||||
&aesenc ($inout3,$rndkey1);
|
||||
&aesenc ($inout4,$rndkey1);
|
||||
&aesenc ($inout5,$rndkey1);
|
||||
|
||||
&call (&label("_aesni_encrypt6_enter"));
|
||||
|
||||
&movups ($rndkey1,&QWP(0,$inp));
|
||||
&movups ($rndkey0,&QWP(0x10,$inp));
|
||||
&xorps ($inout0,$rndkey1);
|
||||
&movups ($rndkey1,&QWP(0x20,$inp));
|
||||
&xorps ($inout1,$rndkey0);
|
||||
&movups (&QWP(0,$out),$inout0);
|
||||
&movdqa ($rndkey0,&QWP(16,"esp")); # load increment
|
||||
&xorps ($inout2,$rndkey1);
|
||||
&movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
|
||||
&movups (&QWP(0x10,$out),$inout1);
|
||||
&movups (&QWP(0x20,$out),$inout2);
|
||||
|
||||
&paddd ($rndkey1,$rndkey0); # 2nd triplet increment
|
||||
&paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
|
||||
&movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
|
||||
|
||||
&movups ($inout1,&QWP(0x30,$inp));
|
||||
&movups ($inout2,&QWP(0x40,$inp));
|
||||
&xorps ($inout3,$inout1);
|
||||
&movups ($inout1,&QWP(0x50,$inp));
|
||||
&lea ($inp,&DWP(0x60,$inp));
|
||||
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
|
||||
&pshufb ($rndkey0,$inout0); # byte swap
|
||||
&xorps ($inout4,$inout2);
|
||||
&movups (&QWP(0x30,$out),$inout3);
|
||||
&xorps ($inout5,$inout1);
|
||||
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
|
||||
&pshufb ($rndkey1,$inout0); # byte swap
|
||||
&movups (&QWP(0x40,$out),$inout4);
|
||||
&pshufd ($inout0,$rndkey0,3<<6);
|
||||
&movups (&QWP(0x50,$out),$inout5);
|
||||
&lea ($out,&DWP(0x60,$out));
|
||||
|
||||
&pshufd ($inout1,$rndkey0,2<<6);
|
||||
&sub ($len,6);
|
||||
&jnc (&label("ctr32_loop6"));
|
||||
|
||||
&add ($len,6);
|
||||
&jz (&label("ctr32_ret"));
|
||||
&movdqu ($inout5,&QWP(0,$key_));
|
||||
&mov ($key,$key_);
|
||||
&pxor ($inout5,&QWP(32,"esp")); # restore counter-less ivec
|
||||
&mov ($rounds,&DWP(240,$key_)); # restore $rounds
|
||||
|
||||
&set_label("ctr32_tail");
|
||||
&por ($inout0,$inout5);
|
||||
&cmp ($len,2);
|
||||
&jb (&label("ctr32_one"));
|
||||
|
||||
&pshufd ($inout2,$rndkey0,1<<6);
|
||||
&por ($inout1,$inout5);
|
||||
&je (&label("ctr32_two"));
|
||||
|
||||
&pshufd ($inout3,$rndkey1,3<<6);
|
||||
&por ($inout2,$inout5);
|
||||
&cmp ($len,4);
|
||||
&jb (&label("ctr32_three"));
|
||||
|
||||
&pshufd ($inout4,$rndkey1,2<<6);
|
||||
&por ($inout3,$inout5);
|
||||
&je (&label("ctr32_four"));
|
||||
|
||||
&por ($inout4,$inout5);
|
||||
&call ("_aesni_encrypt6");
|
||||
&movups ($rndkey1,&QWP(0,$inp));
|
||||
&movups ($rndkey0,&QWP(0x10,$inp));
|
||||
&xorps ($inout0,$rndkey1);
|
||||
&movups ($rndkey1,&QWP(0x20,$inp));
|
||||
&xorps ($inout1,$rndkey0);
|
||||
&movups ($rndkey0,&QWP(0x30,$inp));
|
||||
&xorps ($inout2,$rndkey1);
|
||||
&movups ($rndkey1,&QWP(0x40,$inp));
|
||||
&xorps ($inout3,$rndkey0);
|
||||
&movups (&QWP(0,$out),$inout0);
|
||||
&xorps ($inout4,$rndkey1);
|
||||
&movups (&QWP(0x10,$out),$inout1);
|
||||
&movups (&QWP(0x20,$out),$inout2);
|
||||
&movups (&QWP(0x30,$out),$inout3);
|
||||
&movups (&QWP(0x40,$out),$inout4);
|
||||
&jmp (&label("ctr32_ret"));
|
||||
|
||||
&set_label("ctr32_one_shortcut",16);
|
||||
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
|
||||
&mov ($rounds,&DWP(240,$key));
|
||||
|
||||
&set_label("ctr32_one");
|
||||
if ($inline)
|
||||
{ &aesni_inline_generate1("enc"); }
|
||||
else
|
||||
{ &call ("_aesni_encrypt1"); }
|
||||
&movups ($in0,&QWP(0,$inp));
|
||||
&xorps ($in0,$inout0);
|
||||
&movups (&QWP(0,$out),$in0);
|
||||
&jmp (&label("ctr32_ret"));
|
||||
|
||||
&set_label("ctr32_two",16);
|
||||
&call ("_aesni_encrypt2");
|
||||
&movups ($inout3,&QWP(0,$inp));
|
||||
&movups ($inout4,&QWP(0x10,$inp));
|
||||
&xorps ($inout0,$inout3);
|
||||
&xorps ($inout1,$inout4);
|
||||
&movups (&QWP(0,$out),$inout0);
|
||||
&movups (&QWP(0x10,$out),$inout1);
|
||||
&jmp (&label("ctr32_ret"));
|
||||
|
||||
&set_label("ctr32_three",16);
|
||||
&call ("_aesni_encrypt3");
|
||||
&movups ($inout3,&QWP(0,$inp));
|
||||
&movups ($inout4,&QWP(0x10,$inp));
|
||||
&xorps ($inout0,$inout3);
|
||||
&movups ($inout5,&QWP(0x20,$inp));
|
||||
&xorps ($inout1,$inout4);
|
||||
&movups (&QWP(0,$out),$inout0);
|
||||
&xorps ($inout2,$inout5);
|
||||
&movups (&QWP(0x10,$out),$inout1);
|
||||
&movups (&QWP(0x20,$out),$inout2);
|
||||
&jmp (&label("ctr32_ret"));
|
||||
|
||||
&set_label("ctr32_four",16);
|
||||
&call ("_aesni_encrypt4");
|
||||
&movups ($inout4,&QWP(0,$inp));
|
||||
&movups ($inout5,&QWP(0x10,$inp));
|
||||
&movups ($rndkey1,&QWP(0x20,$inp));
|
||||
&xorps ($inout0,$inout4);
|
||||
&movups ($rndkey0,&QWP(0x30,$inp));
|
||||
&xorps ($inout1,$inout5);
|
||||
&movups (&QWP(0,$out),$inout0);
|
||||
&xorps ($inout2,$rndkey1);
|
||||
&movups (&QWP(0x10,$out),$inout1);
|
||||
&xorps ($inout3,$rndkey0);
|
||||
&movups (&QWP(0x20,$out),$inout2);
|
||||
&movups (&QWP(0x30,$out),$inout3);
|
||||
|
||||
&set_label("ctr32_ret");
|
||||
&pxor ("xmm0","xmm0"); # clear register bank
|
||||
&pxor ("xmm1","xmm1");
|
||||
&pxor ("xmm2","xmm2");
|
||||
&pxor ("xmm3","xmm3");
|
||||
&pxor ("xmm4","xmm4");
|
||||
&movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
|
||||
&pxor ("xmm5","xmm5");
|
||||
&movdqa (&QWP(48,"esp"),"xmm0");
|
||||
&pxor ("xmm6","xmm6");
|
||||
&movdqa (&QWP(64,"esp"),"xmm0");
|
||||
&pxor ("xmm7","xmm7");
|
||||
&mov ("esp",&DWP(80,"esp"));
|
||||
&function_end("${PREFIX}_ctr32_encrypt_blocks");
|
||||
}
|
||||
|
||||
######################################################################
|
||||
# Mechanical port from aesni-x86_64.pl.
|
||||
#
|
||||
# _aesni_set_encrypt_key is private interface,
|
||||
# input:
|
||||
# "eax" const unsigned char *userKey
|
||||
# $rounds int bits
|
||||
# $key AES_KEY *key
|
||||
# output:
|
||||
# "eax" return code
|
||||
# $rounds rounds
|
||||
|
||||
&function_begin_B("_aesni_set_encrypt_key");
|
||||
&push ("ebp");
|
||||
&push ("ebx");
|
||||
&test ("eax","eax");
|
||||
&jz (&label("bad_pointer"));
|
||||
&test ($key,$key);
|
||||
&jz (&label("bad_pointer"));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop("ebx");
|
||||
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
|
||||
|
||||
&picmeup("ebp","GFp_ia32cap_P","ebx",&label("key_const"));
|
||||
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
|
||||
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
|
||||
&mov ("ebp",&DWP(4,"ebp"));
|
||||
&lea ($key,&DWP(16,$key));
|
||||
&and ("ebp",1<<28|1<<11); # AVX and XOP bits
|
||||
&cmp ($rounds,256);
|
||||
&je (&label("14rounds"));
|
||||
# 192-bit key support was removed.
|
||||
&cmp ($rounds,128);
|
||||
&jne (&label("bad_keybits"));
|
||||
|
||||
&set_label("10rounds",16);
|
||||
&cmp ("ebp",1<<28);
|
||||
&je (&label("10rounds_alt"));
|
||||
|
||||
&mov ($rounds,9);
|
||||
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
|
||||
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
|
||||
&call (&label("key_128_cold"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x2); # round 2
|
||||
&call (&label("key_128"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x04); # round 3
|
||||
&call (&label("key_128"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x08); # round 4
|
||||
&call (&label("key_128"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x10); # round 5
|
||||
&call (&label("key_128"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x20); # round 6
|
||||
&call (&label("key_128"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x40); # round 7
|
||||
&call (&label("key_128"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x80); # round 8
|
||||
&call (&label("key_128"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x1b); # round 9
|
||||
&call (&label("key_128"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x36); # round 10
|
||||
&call (&label("key_128"));
|
||||
&$movekey (&QWP(0,$key),"xmm0");
|
||||
&mov (&DWP(80,$key),$rounds);
|
||||
|
||||
&jmp (&label("good_key"));
|
||||
|
||||
&set_label("key_128",16);
|
||||
&$movekey (&QWP(0,$key),"xmm0");
|
||||
&lea ($key,&DWP(16,$key));
|
||||
&set_label("key_128_cold");
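	# the two shufps/xorps pairs below turn xmm0 = {w0,w1,w2,w3} into the
	# running xor {w0, w0^w1, w0^w1^w2, w0^w1^w2^w3} (xmm4's low dword
	# must be zero on entry); shufps 0b11111111 then splats the
	# aeskeygenassist result, SubWord(RotWord(w3))^rcon, so the final
	# xorps yields the next four round-key words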
|
||||
&shufps ("xmm4","xmm0",0b00010000);
|
||||
&xorps ("xmm0","xmm4");
|
||||
&shufps ("xmm4","xmm0",0b10001100);
|
||||
&xorps ("xmm0","xmm4");
|
||||
&shufps ("xmm1","xmm1",0b11111111); # critical path
|
||||
&xorps ("xmm0","xmm1");
|
||||
&ret();
|
||||
|
||||
&set_label("10rounds_alt",16);
|
||||
&movdqa ("xmm5",&QWP(0x00,"ebx"));
|
||||
&mov ($rounds,8);
|
||||
&movdqa ("xmm4",&QWP(0x20,"ebx"));
|
||||
&movdqa ("xmm2","xmm0");
|
||||
&movdqu (&QWP(-16,$key),"xmm0");
|
||||
|
||||
&set_label("loop_key128");
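	# each iteration is an aeskeygenassist-free round of the AES-128 key
	# schedule: pshufb with the key_const mask splats RotWord of the last
	# key word, aesenclast with xmm4 (the rcon vector) applies SubBytes
	# and xors in rcon (ShiftRows is a no-op on four identical columns),
	# pslld xmm4,1 doubles rcon, and the pslldq/pxor chain forms the
	# running xor of the previous round key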
|
||||
&pshufb ("xmm0","xmm5");
|
||||
&aesenclast ("xmm0","xmm4");
|
||||
&pslld ("xmm4",1);
|
||||
&lea ($key,&DWP(16,$key));
|
||||
|
||||
&movdqa ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm2","xmm3");
|
||||
|
||||
&pxor ("xmm0","xmm2");
|
||||
&movdqu (&QWP(-16,$key),"xmm0");
|
||||
&movdqa ("xmm2","xmm0");
|
||||
|
||||
&dec ($rounds);
|
||||
&jnz (&label("loop_key128"));
|
||||
|
||||
&movdqa ("xmm4",&QWP(0x30,"ebx"));
|
||||
|
||||
&pshufb ("xmm0","xmm5");
|
||||
&aesenclast ("xmm0","xmm4");
|
||||
&pslld ("xmm4",1);
|
||||
|
||||
&movdqa ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm2","xmm3");
|
||||
|
||||
&pxor ("xmm0","xmm2");
|
||||
&movdqu (&QWP(0,$key),"xmm0");
|
||||
|
||||
&movdqa ("xmm2","xmm0");
|
||||
&pshufb ("xmm0","xmm5");
|
||||
&aesenclast ("xmm0","xmm4");
|
||||
|
||||
&movdqa ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pslldq ("xmm2",4);
|
||||
&pxor ("xmm2","xmm3");
|
||||
|
||||
&pxor ("xmm0","xmm2");
|
||||
&movdqu (&QWP(16,$key),"xmm0");
|
||||
|
||||
&mov ($rounds,9);
|
||||
&mov (&DWP(96,$key),$rounds);
|
||||
|
||||
&jmp (&label("good_key"));
|
||||
|
||||
# 192-bit key support was removed.
|
||||
|
||||
&set_label("14rounds",16);
|
||||
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
|
||||
&lea ($key,&DWP(16,$key));
|
||||
&cmp ("ebp",1<<28);
|
||||
&je (&label("14rounds_alt"));
|
||||
|
||||
&mov ($rounds,13);
|
||||
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
|
||||
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
|
||||
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
|
||||
&call (&label("key_256a_cold"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x01); # round 3
|
||||
&call (&label("key_256b"));
|
||||
&aeskeygenassist("xmm1","xmm2",0x02); # round 4
|
||||
&call (&label("key_256a"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x02); # round 5
|
||||
&call (&label("key_256b"));
|
||||
&aeskeygenassist("xmm1","xmm2",0x04); # round 6
|
||||
&call (&label("key_256a"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x04); # round 7
|
||||
&call (&label("key_256b"));
|
||||
&aeskeygenassist("xmm1","xmm2",0x08); # round 8
|
||||
&call (&label("key_256a"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x08); # round 9
|
||||
&call (&label("key_256b"));
|
||||
&aeskeygenassist("xmm1","xmm2",0x10); # round 10
|
||||
&call (&label("key_256a"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x10); # round 11
|
||||
&call (&label("key_256b"));
|
||||
&aeskeygenassist("xmm1","xmm2",0x20); # round 12
|
||||
&call (&label("key_256a"));
|
||||
&aeskeygenassist("xmm1","xmm0",0x20); # round 13
|
||||
&call (&label("key_256b"));
|
||||
&aeskeygenassist("xmm1","xmm2",0x40); # round 14
|
||||
&call (&label("key_256a"));
|
||||
&$movekey (&QWP(0,$key),"xmm0");
|
||||
&mov (&DWP(16,$key),$rounds);
|
||||
&xor ("eax","eax");
|
||||
|
||||
&jmp (&label("good_key"));
|
||||
|
||||
&set_label("key_256a",16);
|
||||
&$movekey (&QWP(0,$key),"xmm2");
|
||||
&lea ($key,&DWP(16,$key));
|
||||
&set_label("key_256a_cold");
|
||||
&shufps ("xmm4","xmm0",0b00010000);
|
||||
&xorps ("xmm0","xmm4");
|
||||
&shufps ("xmm4","xmm0",0b10001100);
|
||||
&xorps ("xmm0","xmm4");
|
||||
&shufps ("xmm1","xmm1",0b11111111); # critical path
|
||||
&xorps ("xmm0","xmm1");
|
||||
&ret();
|
||||
|
||||
&set_label("key_256b",16);
|
||||
&$movekey (&QWP(0,$key),"xmm0");
|
||||
&lea ($key,&DWP(16,$key));
|
||||
|
||||
&shufps ("xmm4","xmm2",0b00010000);
|
||||
&xorps ("xmm2","xmm4");
|
||||
&shufps ("xmm4","xmm2",0b10001100);
|
||||
&xorps ("xmm2","xmm4");
|
||||
&shufps ("xmm1","xmm1",0b10101010); # critical path
|
||||
&xorps ("xmm2","xmm1");
|
||||
&ret();
|
||||
|
||||
&set_label("14rounds_alt",16);
|
||||
&movdqa ("xmm5",&QWP(0x00,"ebx"));
|
||||
&movdqa ("xmm4",&QWP(0x20,"ebx"));
|
||||
&mov ($rounds,7);
|
||||
&movdqu (&QWP(-32,$key),"xmm0");
|
||||
&movdqa ("xmm1","xmm2");
|
||||
&movdqu (&QWP(-16,$key),"xmm2");
|
||||
|
||||
&set_label("loop_key256");
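	# AES-256 schedule without aeskeygenassist: the pshufb/aesenclast pair
	# computes SubWord(RotWord(last word))^rcon for the "odd" round keys,
	# while the pshufd(0xff)/aesenclast-with-zero pair further down applies
	# SubWord with no rotation and no rcon for the "even" ones, as FIPS-197
	# specifies for 256-bit keys; the pslldq/pxor chains again form the
	# running xor of the preceding key half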
|
||||
&pshufb ("xmm2","xmm5");
|
||||
&aesenclast ("xmm2","xmm4");
|
||||
|
||||
&movdqa ("xmm3","xmm0");
|
||||
&pslldq ("xmm0",4);
|
||||
&pxor ("xmm3","xmm0");
|
||||
&pslldq ("xmm0",4);
|
||||
&pxor ("xmm3","xmm0");
|
||||
&pslldq ("xmm0",4);
|
||||
&pxor ("xmm0","xmm3");
|
||||
&pslld ("xmm4",1);
|
||||
|
||||
&pxor ("xmm0","xmm2");
|
||||
&movdqu (&QWP(0,$key),"xmm0");
|
||||
|
||||
&dec ($rounds);
|
||||
&jz (&label("done_key256"));
|
||||
|
||||
&pshufd ("xmm2","xmm0",0xff);
|
||||
&pxor ("xmm3","xmm3");
|
||||
&aesenclast ("xmm2","xmm3");
|
||||
|
||||
&movdqa ("xmm3","xmm1");
|
||||
&pslldq ("xmm1",4);
|
||||
&pxor ("xmm3","xmm1");
|
||||
&pslldq ("xmm1",4);
|
||||
&pxor ("xmm3","xmm1");
|
||||
&pslldq ("xmm1",4);
|
||||
&pxor ("xmm1","xmm3");
|
||||
|
||||
&pxor ("xmm2","xmm1");
|
||||
&movdqu (&QWP(16,$key),"xmm2");
|
||||
&lea ($key,&DWP(32,$key));
|
||||
&movdqa ("xmm1","xmm2");
|
||||
&jmp (&label("loop_key256"));
|
||||
|
||||
&set_label("done_key256");
|
||||
&mov ($rounds,13);
|
||||
&mov (&DWP(16,$key),$rounds);
|
||||
|
||||
&set_label("good_key");
|
||||
&pxor ("xmm0","xmm0");
|
||||
&pxor ("xmm1","xmm1");
|
||||
&pxor ("xmm2","xmm2");
|
||||
&pxor ("xmm3","xmm3");
|
||||
&pxor ("xmm4","xmm4");
|
||||
&pxor ("xmm5","xmm5");
|
||||
&xor ("eax","eax");
|
||||
&pop ("ebx");
|
||||
&pop ("ebp");
|
||||
&ret ();
|
||||
|
||||
&set_label("bad_pointer",4);
|
||||
&mov ("eax",-1);
|
||||
&pop ("ebx");
|
||||
&pop ("ebp");
|
||||
&ret ();
|
||||
&set_label("bad_keybits",4);
|
||||
&pxor ("xmm0","xmm0");
|
||||
&mov ("eax",-2);
|
||||
&pop ("ebx");
|
||||
&pop ("ebp");
|
||||
&ret ();
|
||||
&function_end_B("_aesni_set_encrypt_key");
|
||||
|
||||
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
|
||||
# AES_KEY *key)
|
||||
&function_begin_B("${PREFIX}_set_encrypt_key");
|
||||
&mov ("eax",&wparam(0));
|
||||
&mov ($rounds,&wparam(1));
|
||||
&mov ($key,&wparam(2));
|
||||
&call ("_aesni_set_encrypt_key");
|
||||
&ret ();
|
||||
&function_end_B("${PREFIX}_set_encrypt_key");
|
||||
|
||||
&set_label("key_const",64);
|
||||
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
|
||||
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
|
||||
&data_word(1,1,1,1);
|
||||
&data_word(0x1b,0x1b,0x1b,0x1b);
|
||||
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
||||
|
||||
close STDOUT or die "error closing STDOUT";
|
||||
1704
zeroidc/vendor/ring/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
vendored
Normal file
File diff suppressed because it is too large
630
zeroidc/vendor/ring/crypto/fipsmodule/aes/asm/aesv8-armx.pl
vendored
Normal file
@@ -0,0 +1,630 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# This module implements support for ARMv8 AES instructions. The
|
||||
# module is endian-agnostic in the sense that it supports both big- and
|
||||
# little-endian cases. It likewise supports both 32- and 64-bit modes
|
||||
# of operation. The latter is achieved by limiting the number of utilized
|
||||
# registers to 16, which implies additional NEON load and integer
|
||||
# instructions. This has no effect on mighty Apple A7, where results
|
||||
# are literally equal to the theoretical estimates based on AES
|
||||
# instruction latencies and issue rates. On Cortex-A53, an in-order
|
||||
# execution core, this costs up to 10-15%, which is partially
|
||||
# compensated by implementing dedicated code path for 128-bit
|
||||
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
|
||||
# seems to be limited by sheer amount of NEON instructions...
|
||||
#
|
||||
# Performance in cycles per byte processed with 128-bit key:
|
||||
#
|
||||
# CBC enc CBC dec CTR
|
||||
# Apple A7 2.39 1.20 1.20
|
||||
# Cortex-A53 1.32 1.29 1.46
|
||||
# Cortex-A57(*) 1.95 0.85 0.93
|
||||
# Denver 1.96 0.86 0.80
|
||||
# Mongoose 1.33 1.20 1.20
|
||||
#
|
||||
# (*) original 3.64/1.34/1.32 results were for r0p0 revision
|
||||
# and are still the same even for the updated module;
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
$prefix="aes_hw";
|
||||
|
||||
$code=<<___;
|
||||
#include <GFp/arm_arch.h>
|
||||
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.text
|
||||
___
|
||||
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
.arch	armv7-a // don't confuse not-so-latest binutils with armv8 :-)
|
||||
.fpu neon
|
||||
.code 32
|
||||
#undef __thumb2__
|
||||
___
|
||||
|
||||
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
|
||||
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
|
||||
# maintain both 32- and 64-bit codes within single module and
|
||||
# transliterate common code to either flavour with regex voodoo.
|
||||
#
|
||||
{{{
|
||||
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
|
||||
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
|
||||
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
|
||||
|
||||
|
||||
# On AArch64, put the data in .rodata and use adrp + add for compatibility with
|
||||
# execute-only memory. On AArch32, put it in .text and use adr.
|
||||
$code.= ".section .rodata\n" if ($flavour =~ /64/);
|
||||
$code.=<<___;
|
||||
.align 5
|
||||
.Lrcon:
|
||||
.long 0x01,0x01,0x01,0x01
|
||||
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
|
||||
.long 0x1b,0x1b,0x1b,0x1b
|
||||
|
||||
.text
|
||||
|
||||
.globl GFp_${prefix}_set_encrypt_key
|
||||
.type GFp_${prefix}_set_encrypt_key,%function
|
||||
.align 5
|
||||
GFp_${prefix}_set_encrypt_key:
|
||||
.Lenc_key:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
___
|
||||
$code.=<<___;
|
||||
mov $ptr,#-1
|
||||
cmp $inp,#0
|
||||
b.eq .Lenc_key_abort
|
||||
cmp $out,#0
|
||||
b.eq .Lenc_key_abort
|
||||
mov $ptr,#-2
|
||||
cmp $bits,#128
|
||||
b.lt .Lenc_key_abort
|
||||
cmp $bits,#256
|
||||
b.gt .Lenc_key_abort
|
||||
tst $bits,#0x3f
|
||||
b.ne .Lenc_key_abort
|
||||
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
adrp $ptr,:pg_hi21:.Lrcon
|
||||
add $ptr,$ptr,:lo12:.Lrcon
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
adr $ptr,.Lrcon
|
||||
___
|
||||
$code.=<<___;
|
||||
cmp $bits,#192
|
||||
|
||||
veor $zero,$zero,$zero
|
||||
vld1.8 {$in0},[$inp],#16
|
||||
mov $bits,#8 // reuse $bits
|
||||
vld1.32 {$rcon,$mask},[$ptr],#32
|
||||
|
||||
b.lt .Loop128
|
||||
// 192-bit key support was removed.
|
||||
b .L256
|
||||
|
||||
.align 4
|
||||
.Loop128:
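	// One key-schedule round per iteration: vtbl with the rotate-n-splat
	// mask broadcasts RotWord of the last key word, aese with an all-zero
	// round key reduces to SubBytes (ShiftRows is harmless on four
	// identical columns), the vext/veor chain forms the running xor of
	// the previous key, and vshl.u8 doubles the round constant.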
|
||||
vtbl.8 $key,{$in0},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in0},[$out],#16
|
||||
aese $key,$zero
|
||||
subs $bits,$bits,#1
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $key,$key,$rcon
|
||||
veor $in0,$in0,$tmp
|
||||
vshl.u8 $rcon,$rcon,#1
|
||||
veor $in0,$in0,$key
|
||||
b.ne .Loop128
|
||||
|
||||
vld1.32 {$rcon},[$ptr]
|
||||
|
||||
vtbl.8 $key,{$in0},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in0},[$out],#16
|
||||
aese $key,$zero
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $key,$key,$rcon
|
||||
veor $in0,$in0,$tmp
|
||||
vshl.u8 $rcon,$rcon,#1
|
||||
veor $in0,$in0,$key
|
||||
|
||||
vtbl.8 $key,{$in0},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in0},[$out],#16
|
||||
aese $key,$zero
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $key,$key,$rcon
|
||||
veor $in0,$in0,$tmp
|
||||
veor $in0,$in0,$key
|
||||
vst1.32 {$in0},[$out]
|
||||
add $out,$out,#0x50
|
||||
|
||||
mov $rounds,#10
|
||||
b .Ldone
|
||||
|
||||
// 192-bit key support was removed.
|
||||
|
||||
.align 4
|
||||
.L256:
|
||||
vld1.8 {$in1},[$inp]
|
||||
mov $bits,#7
|
||||
mov $rounds,#14
|
||||
vst1.32 {$in0},[$out],#16
|
||||
|
||||
.Loop256:
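	// Same construction for AES-256: vtbl/aese/veor-with-rcon produces the
	// SubWord(RotWord())^rcon words, while the vdup.32/aese-with-zero
	// sequence below generates the intermediate SubWord-only words (no
	// rotation, no rcon) required every eighth word of the 256-bit
	// schedule.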
|
||||
vtbl.8 $key,{$in1},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in1},[$out],#16
|
||||
aese $key,$zero
|
||||
subs $bits,$bits,#1
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $key,$key,$rcon
|
||||
veor $in0,$in0,$tmp
|
||||
vshl.u8 $rcon,$rcon,#1
|
||||
veor $in0,$in0,$key
|
||||
vst1.32 {$in0},[$out],#16
|
||||
b.eq .Ldone
|
||||
|
||||
vdup.32 $key,${in0}[3] // just splat
|
||||
vext.8 $tmp,$zero,$in1,#12
|
||||
aese $key,$zero
|
||||
|
||||
veor $in1,$in1,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in1,$in1,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in1,$in1,$tmp
|
||||
|
||||
veor $in1,$in1,$key
|
||||
b .Loop256
|
||||
|
||||
.Ldone:
|
||||
str $rounds,[$out]
|
||||
mov $ptr,#0
|
||||
|
||||
.Lenc_key_abort:
|
||||
mov x0,$ptr // return value
|
||||
`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
|
||||
ret
|
||||
.size GFp_${prefix}_set_encrypt_key,.-GFp_${prefix}_set_encrypt_key
|
||||
___
|
||||
}}}
|
||||
{{{
|
||||
sub gen_block () {
|
||||
my $dir = shift;
|
||||
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
|
||||
my ($inp,$out,$key)=map("x$_",(0..2));
|
||||
my $rounds="w3";
|
||||
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
|
||||
|
||||
$code.=<<___;
|
||||
.globl GFp_${prefix}_${dir}crypt
|
||||
.type GFp_${prefix}_${dir}crypt,%function
|
||||
.align 5
|
||||
GFp_${prefix}_${dir}crypt:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ldr $rounds,[$key,#240]
|
||||
vld1.32 {$rndkey0},[$key],#16
|
||||
vld1.8 {$inout},[$inp]
|
||||
sub $rounds,$rounds,#2
|
||||
vld1.32 {$rndkey1},[$key],#16
|
||||
|
||||
.Loop_${dir}c:
|
||||
aes$e $inout,$rndkey0
|
||||
aes$mc $inout,$inout
|
||||
vld1.32 {$rndkey0},[$key],#16
|
||||
subs $rounds,$rounds,#2
|
||||
aes$e $inout,$rndkey1
|
||||
aes$mc $inout,$inout
|
||||
vld1.32 {$rndkey1},[$key],#16
|
||||
b.gt .Loop_${dir}c
|
||||
|
||||
aes$e $inout,$rndkey0
|
||||
aes$mc $inout,$inout
|
||||
vld1.32 {$rndkey0},[$key]
|
||||
aes$e $inout,$rndkey1
|
||||
veor $inout,$inout,$rndkey0
|
||||
|
||||
vst1.8 {$inout},[$out]
|
||||
ret
|
||||
.size GFp_${prefix}_${dir}crypt,.-GFp_${prefix}_${dir}crypt
|
||||
___
|
||||
}
|
||||
&gen_block("en");
|
||||
&gen_block("de");
|
||||
}}}
|
||||
{{{
|
||||
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
|
||||
my ($rounds,$cnt,$key_)=("w5","w6","x7");
|
||||
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
|
||||
my $step="x12"; # aliases with $tctr2
|
||||
|
||||
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
|
||||
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
|
||||
|
||||
my ($dat,$tmp)=($dat0,$tmp0);
|
||||
|
||||
### q8-q15 preloaded key schedule
|
||||
|
||||
$code.=<<___;
|
||||
.globl GFp_${prefix}_ctr32_encrypt_blocks
|
||||
.type GFp_${prefix}_ctr32_encrypt_blocks,%function
|
||||
.align 5
|
||||
GFp_${prefix}_ctr32_encrypt_blocks:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
mov ip,sp
|
||||
stmdb sp!,{r4-r10,lr}
|
||||
vstmdb sp!,{d8-d15} @ ABI specification says so
|
||||
ldr r4, [ip] @ load remaining arg
|
||||
___
|
||||
$code.=<<___;
|
||||
ldr $rounds,[$key,#240]
|
||||
|
||||
ldr $ctr, [$ivp, #12]
|
||||
vld1.32 {$dat0},[$ivp]
|
||||
|
||||
vld1.32 {q8-q9},[$key] // load key schedule...
|
||||
sub $rounds,$rounds,#4
|
||||
mov $step,#16
|
||||
cmp $len,#2
|
||||
add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
|
||||
sub $rounds,$rounds,#2
|
||||
vld1.32 {q12-q13},[$key_],#32
|
||||
vld1.32 {q14-q15},[$key_],#32
|
||||
vld1.32 {$rndlast},[$key_]
|
||||
add $key_,$key,#32
|
||||
mov $cnt,$rounds
|
||||
cclr $step,lo
|
||||
|
||||
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
|
||||
// affected by silicon errata #1742098 [0] and #1655431 [1],
|
||||
// respectively, where the second instruction of an aese/aesmc
|
||||
// instruction pair may execute twice if an interrupt is taken right
|
||||
// after the first instruction consumes an input register of which a
|
||||
// single 32-bit lane has been updated the last time it was modified.
|
||||
//
|
||||
// This function uses a counter in one 32-bit lane. The vmov.32 lines
|
||||
// could write to $dat1 and $dat2 directly, but that trips these bugs.
|
||||
// We write to $ivec and copy to the final register as a workaround.
|
||||
//
|
||||
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
|
||||
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
|
||||
#ifndef __ARMEB__
|
||||
rev $ctr, $ctr
|
||||
#endif
|
||||
add $tctr1, $ctr, #1
|
||||
vorr $ivec,$dat0,$dat0
|
||||
rev $tctr1, $tctr1
|
||||
vmov.32 ${ivec}[3],$tctr1
|
||||
add $ctr, $ctr, #2
|
||||
vorr $dat1,$ivec,$ivec
|
||||
b.ls .Lctr32_tail
|
||||
rev $tctr2, $ctr
|
||||
vmov.32 ${ivec}[3],$tctr2
|
||||
sub $len,$len,#3 // bias
|
||||
vorr $dat2,$ivec,$ivec
|
||||
b .Loop3x_ctr32
|
||||
|
||||
.align 4
|
||||
.Loop3x_ctr32:
|
||||
aese $dat0,q8
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q8
|
||||
aesmc $dat1,$dat1
|
||||
aese $dat2,q8
|
||||
aesmc $dat2,$dat2
|
||||
vld1.32 {q8},[$key_],#16
|
||||
subs $cnt,$cnt,#2
|
||||
aese $dat0,q9
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q9
|
||||
aesmc $dat1,$dat1
|
||||
aese $dat2,q9
|
||||
aesmc $dat2,$dat2
|
||||
vld1.32 {q9},[$key_],#16
|
||||
b.gt .Loop3x_ctr32
|
||||
|
||||
aese $dat0,q8
|
||||
aesmc $tmp0,$dat0
|
||||
aese $dat1,q8
|
||||
aesmc $tmp1,$dat1
|
||||
vld1.8 {$in0},[$inp],#16
|
||||
add $tctr0,$ctr,#1
|
||||
aese $dat2,q8
|
||||
aesmc $dat2,$dat2
|
||||
vld1.8 {$in1},[$inp],#16
|
||||
rev $tctr0,$tctr0
|
||||
aese $tmp0,q9
|
||||
aesmc $tmp0,$tmp0
|
||||
aese $tmp1,q9
|
||||
aesmc $tmp1,$tmp1
|
||||
vld1.8 {$in2},[$inp],#16
|
||||
mov $key_,$key
|
||||
aese $dat2,q9
|
||||
aesmc $tmp2,$dat2
|
||||
aese $tmp0,q12
|
||||
aesmc $tmp0,$tmp0
|
||||
aese $tmp1,q12
|
||||
aesmc $tmp1,$tmp1
|
||||
veor $in0,$in0,$rndlast
|
||||
add $tctr1,$ctr,#2
|
||||
aese $tmp2,q12
|
||||
aesmc $tmp2,$tmp2
|
||||
veor $in1,$in1,$rndlast
|
||||
add $ctr,$ctr,#3
|
||||
aese $tmp0,q13
|
||||
aesmc $tmp0,$tmp0
|
||||
aese $tmp1,q13
|
||||
aesmc $tmp1,$tmp1
|
||||
// Note the logic to update $dat0, $dat1, and $dat2 is written to work
|
||||
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
|
||||
// 32-bit mode. See the comment above.
|
||||
veor $in2,$in2,$rndlast
|
||||
vmov.32 ${ivec}[3], $tctr0
|
||||
aese $tmp2,q13
|
||||
aesmc $tmp2,$tmp2
|
||||
vorr $dat0,$ivec,$ivec
|
||||
rev $tctr1,$tctr1
|
||||
aese $tmp0,q14
|
||||
aesmc $tmp0,$tmp0
|
||||
vmov.32 ${ivec}[3], $tctr1
|
||||
rev $tctr2,$ctr
|
||||
aese $tmp1,q14
|
||||
aesmc $tmp1,$tmp1
|
||||
vorr $dat1,$ivec,$ivec
|
||||
vmov.32 ${ivec}[3], $tctr2
|
||||
aese $tmp2,q14
|
||||
aesmc $tmp2,$tmp2
|
||||
vorr $dat2,$ivec,$ivec
|
||||
subs $len,$len,#3
|
||||
aese $tmp0,q15
|
||||
aese $tmp1,q15
|
||||
aese $tmp2,q15
|
||||
|
||||
veor $in0,$in0,$tmp0
|
||||
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
|
||||
vst1.8 {$in0},[$out],#16
|
||||
veor $in1,$in1,$tmp1
|
||||
mov $cnt,$rounds
|
||||
vst1.8 {$in1},[$out],#16
|
||||
veor $in2,$in2,$tmp2
|
||||
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
|
||||
vst1.8 {$in2},[$out],#16
|
||||
b.hs .Loop3x_ctr32
|
||||
|
||||
adds $len,$len,#3
|
||||
b.eq .Lctr32_done
|
||||
cmp $len,#1
|
||||
mov $step,#16
|
||||
cclr $step,eq
|
||||
|
||||
.Lctr32_tail:
|
||||
aese $dat0,q8
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q8
|
||||
aesmc $dat1,$dat1
|
||||
vld1.32 {q8},[$key_],#16
|
||||
subs $cnt,$cnt,#2
|
||||
aese $dat0,q9
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q9
|
||||
aesmc $dat1,$dat1
|
||||
vld1.32 {q9},[$key_],#16
|
||||
b.gt .Lctr32_tail
|
||||
|
||||
aese $dat0,q8
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q8
|
||||
aesmc $dat1,$dat1
|
||||
aese $dat0,q9
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q9
|
||||
aesmc $dat1,$dat1
|
||||
vld1.8 {$in0},[$inp],$step
|
||||
aese $dat0,q12
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q12
|
||||
aesmc $dat1,$dat1
|
||||
vld1.8 {$in1},[$inp]
|
||||
aese $dat0,q13
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q13
|
||||
aesmc $dat1,$dat1
|
||||
veor $in0,$in0,$rndlast
|
||||
aese $dat0,q14
|
||||
aesmc $dat0,$dat0
|
||||
aese $dat1,q14
|
||||
aesmc $dat1,$dat1
|
||||
veor $in1,$in1,$rndlast
|
||||
aese $dat0,q15
|
||||
aese $dat1,q15
|
||||
|
||||
cmp $len,#1
|
||||
veor $in0,$in0,$dat0
|
||||
veor $in1,$in1,$dat1
|
||||
vst1.8 {$in0},[$out],#16
|
||||
b.eq .Lctr32_done
|
||||
vst1.8 {$in1},[$out]
|
||||
|
||||
.Lctr32_done:
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
vldmia sp!,{d8-d15}
|
||||
ldmia sp!,{r4-r10,pc}
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
ldr x29,[sp],#16
|
||||
ret
|
||||
___
|
||||
$code.=<<___;
|
||||
.size GFp_${prefix}_ctr32_encrypt_blocks,.-GFp_${prefix}_ctr32_encrypt_blocks
|
||||
___
|
||||
}}}
|
||||
$code.=<<___;
|
||||
#endif
|
||||
___
|
||||
########################################
|
||||
if ($flavour =~ /64/) { ######## 64-bit code
|
||||
my %opcode = (
|
||||
"aesd" => 0x4e285800, "aese" => 0x4e284800,
|
||||
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
|
||||
|
||||
local *unaes = sub {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
|
||||
sprintf ".inst\t0x%08x\t//%s %s",
|
||||
$opcode{$mnemonic}|$1|($2<<5),
|
||||
$mnemonic,$arg;
|
||||
};
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval($1)/geo;
|
||||
|
||||
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
|
||||
s/@\s/\/\//o; # old->new style commentary
|
||||
|
||||
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
|
||||
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
|
||||
s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
|
||||
s/vmov\.i8/movi/o or # fix up legacy mnemonics
|
||||
s/vext\.8/ext/o or
|
||||
s/vrev32\.8/rev32/o or
|
||||
s/vtst\.8/cmtst/o or
|
||||
s/vshr/ushr/o or
|
||||
s/^(\s+)v/$1/o or # strip off v prefix
|
||||
s/\bbx\s+lr\b/ret/o;
|
||||
|
||||
# fix up remaining legacy suffixes
|
||||
s/\.[ui]?8//o;
|
||||
m/\],#8/o and s/\.16b/\.8b/go;
|
||||
s/\.[ui]?32//o and s/\.16b/\.4s/go;
|
||||
s/\.[ui]?64//o and s/\.16b/\.2d/go;
|
||||
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
} else { ######## 32-bit code
|
||||
my %opcode = (
|
||||
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
|
||||
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
|
||||
|
||||
local *unaes = sub {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
|
||||
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<1) |(($2&8)<<2);
|
||||
# ARMv7 instructions are always encoded little-endian.
|
||||
# The correct solution is to use the .inst directive, but older
|
||||
# assemblers don't implement it:-(
|
||||
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||||
$word&0xff,($word>>8)&0xff,
|
||||
($word>>16)&0xff,($word>>24)&0xff,
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
};
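	# For example, "aese q0,q15" packs to
	#   0xf3b00300 | ((15&7)<<1) | ((15&8)<<2) = 0xf3b0032e,
	# which the closure emits little-endian as
	#   .byte 0x2e,0x03,0xb0,0xf3.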
|
||||
|
||||
sub unvtbl {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
|
||||
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
|
||||
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
|
||||
}
|
||||
|
||||
sub unvdup32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
}
|
||||
|
||||
sub unvmov32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
|
||||
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval($1)/geo;
|
||||
|
||||
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
|
||||
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
|
||||
s/\/\/\s?/@ /o; # new->old style commentary
|
||||
|
||||
# fix up remaining new-style suffixes
|
||||
s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
|
||||
s/\],#[0-9]+/]!/o;
|
||||
|
||||
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
|
||||
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
|
||||
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
|
||||
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
|
||||
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
|
||||
s/^(\s+)b\./$1b/o or
|
||||
s/^(\s+)mov\./$1mov/o or
|
||||
s/^(\s+)ret/$1bx\tlr/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
}
|
||||
|
||||
close STDOUT or die "error closing STDOUT";
|
||||
1142
zeroidc/vendor/ring/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
vendored
Normal file
File diff suppressed because it is too large
603
zeroidc/vendor/ring/crypto/fipsmodule/aes/asm/vpaes-x86.pl
vendored
Normal file
@@ -0,0 +1,603 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
######################################################################
|
||||
## Constant-time SSSE3 AES core implementation.
|
||||
## version 0.1
|
||||
##
|
||||
## By Mike Hamburg (Stanford University), 2009
|
||||
## Public domain.
|
||||
##
|
||||
## For details see http://shiftleft.org/papers/vector_aes/ and
|
||||
## http://crypto.stanford.edu/vpaes/.
|
||||
|
||||
######################################################################
|
||||
# September 2011.
|
||||
#
|
||||
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
|
||||
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
|
||||
# doesn't handle partial vectors (doesn't have to if called from
|
||||
# EVP only). "Drop-in" implies that this module doesn't share key
|
||||
# schedule structure with the original nor does it make assumptions
|
||||
# about its alignment...
|
||||
#
|
||||
# Performance summary. aes-586.pl column lists large-block CBC
|
||||
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
|
||||
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
|
||||
# large-block CBC] encrypt/decrypt.
|
||||
#
|
||||
# aes-586.pl vpaes-x86.pl
|
||||
#
|
||||
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
|
||||
# Nehalem 27.9/40.4/18.1 10.2/11.9
|
||||
# Atom 70.7/92.1/60.1 61.1/75.4(***)
|
||||
# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
|
||||
#
|
||||
# (*)	"Hyper-threading" in this context refers to cache shared
|
||||
#	among multiple cores rather than to Intel HTT specifically. As the vast
|
||||
#	majority of contemporary cores share cache, the slower code path
|
||||
#	is commonplace. In other words, "with-hyper-threading-off"
|
||||
# results are presented mostly for reference purposes.
|
||||
#
|
||||
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
|
||||
#
|
||||
# (***) Less impressive improvement on Core 2 and Atom is due to slow
|
||||
# pshufb, yet it's respectable +28%/64% improvement on Core 2
|
||||
# and +15% on Atom (as implied, over "hyper-threading-safe"
|
||||
# code path).
|
||||
#
|
||||
# <appro@openssl.org>
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
$output = pop;
|
||||
open OUT,">$output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
|
||||
|
||||
$PREFIX="vpaes";
|
||||
|
||||
my ($round, $base, $magic, $key, $const, $inp, $out)=
|
||||
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
|
||||
|
||||
&static_label("_vpaes_consts");
|
||||
&static_label("_vpaes_schedule_low_round");
|
||||
|
||||
&set_label("_vpaes_consts",64);
|
||||
$k_inv=-0x30; # inv, inva
|
||||
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
|
||||
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
|
||||
|
||||
$k_s0F=-0x10; # s0F
|
||||
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
|
||||
|
||||
$k_ipt=0x00; # input transform (lo, hi)
|
||||
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
|
||||
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
|
||||
|
||||
$k_sb1=0x20; # sb1u, sb1t
|
||||
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
|
||||
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
|
||||
$k_sb2=0x40; # sb2u, sb2t
|
||||
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
|
||||
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
|
||||
$k_sbo=0x60; # sbou, sbot
|
||||
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
|
||||
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
|
||||
|
||||
$k_mc_forward=0x80; # mc_forward
|
||||
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
|
||||
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
|
||||
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
|
||||
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
|
||||
|
||||
$k_mc_backward=0xc0; # mc_backward
|
||||
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
|
||||
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
|
||||
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
|
||||
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
|
||||
|
||||
$k_sr=0x100; # sr
|
||||
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
|
||||
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
|
||||
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
|
||||
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
|
||||
|
||||
$k_rcon=0x140; # rcon
|
||||
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
|
||||
|
||||
$k_s63=0x150; # s63: all equal to 0x63 transformed
|
||||
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
|
||||
|
||||
$k_opt=0x160; # output transform
|
||||
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
|
||||
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
|
||||
|
||||
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
|
||||
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
|
||||
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
|
||||
|
||||
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
|
||||
&align (64);
|
||||
|
||||
&function_begin_B("_vpaes_preheat");
|
||||
&add ($const,&DWP(0,"esp"));
|
||||
&movdqa ("xmm7",&QWP($k_inv,$const));
|
||||
&movdqa ("xmm6",&QWP($k_s0F,$const));
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_preheat");
|
||||
|
||||
##
|
||||
## _aes_encrypt_core
|
||||
##
|
||||
## AES-encrypt %xmm0.
|
||||
##
|
||||
## Inputs:
|
||||
## %xmm0 = input
|
||||
## %xmm6-%xmm7 as in _vpaes_preheat
|
||||
## (%edx) = scheduled keys
|
||||
##
|
||||
## Output in %xmm0
|
||||
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
|
||||
##
|
||||
##
|
||||
&function_begin_B("_vpaes_encrypt_core");
|
||||
&mov ($magic,16);
|
||||
&mov ($round,&DWP(240,$key));
|
||||
&movdqa ("xmm1","xmm6");
|
||||
&movdqa ("xmm2",&QWP($k_ipt,$const));
|
||||
&pandn ("xmm1","xmm0");
|
||||
&pand ("xmm0","xmm6");
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&pshufb ("xmm2","xmm0");
|
||||
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
|
||||
&pxor ("xmm2","xmm5");
|
||||
&psrld ("xmm1",4);
|
||||
&add ($key,16);
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&lea ($base,&DWP($k_mc_backward,$const));
|
||||
&pxor ("xmm0","xmm2");
|
||||
&jmp (&label("enc_entry"));
|
||||
|
||||
|
||||
&set_label("enc_loop",16);
|
||||
# middle of middle round
|
||||
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
|
||||
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sb1u
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb1t
|
||||
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
|
||||
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
|
||||
&pxor ("xmm0","xmm4"); # 0 = A
|
||||
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
|
||||
&pshufb ("xmm5","xmm2"); # 4 = sb2u
|
||||
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
|
||||
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
|
||||
&pshufb ("xmm2","xmm3"); # 2 = sb2t
|
||||
&movdqa ("xmm3","xmm0"); # 3 = A
|
||||
&pxor ("xmm2","xmm5"); # 2 = 2A
|
||||
&pshufb ("xmm0","xmm1"); # 0 = B
|
||||
&add ($key,16); # next key
|
||||
&pxor ("xmm0","xmm2"); # 0 = 2A+B
|
||||
&pshufb ("xmm3","xmm4"); # 3 = D
|
||||
&add ($magic,16); # next mc
|
||||
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
|
||||
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
|
||||
&and ($magic,0x30); # ... mod 4
|
||||
&sub ($round,1); # nr--
|
||||
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
|
||||
|
||||
&set_label("enc_entry");
|
||||
# top of round
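	# The S-box is computed arithmetically rather than via a 256-byte
	# table: the state is split into 4-bit nibbles and the GF(2^8)
	# inversion is assembled from 16-entry pshufb lookups into k_inv
	# (Hamburg's GF(2^4) tower construction), which keeps every lookup
	# constant-time.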
|
||||
&movdqa ("xmm1","xmm6"); # 1 : i
|
||||
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&pandn ("xmm1","xmm0"); # 1 = i<<4
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm6"); # 0 = k
|
||||
&pshufb ("xmm5","xmm0"); # 2 = a/k
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/i
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&pshufb ("xmm3","xmm1"); # 3 = 1/i
|
||||
&movdqa ("xmm4","xmm7"); # 4 : 1/j
|
||||
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
|
||||
&pshufb ("xmm4","xmm0"); # 4 = 1/j
|
||||
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
|
||||
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
|
||||
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&pxor ("xmm3","xmm1"); # 3 = jo
|
||||
&jnz (&label("enc_loop"));
|
||||
|
||||
# middle of last round
|
||||
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
|
||||
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbou
|
||||
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb1t
|
||||
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
|
||||
&pxor ("xmm0","xmm4"); # 0 = A
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_encrypt_core");
|
||||
|
||||
########################################################
|
||||
## ##
|
||||
## AES key schedule ##
|
||||
## ##
|
||||
########################################################
|
||||
&function_begin_B("_vpaes_schedule_core");
|
||||
&add ($const,&DWP(0,"esp"));
|
||||
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
|
||||
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
|
||||
|
||||
# input transform
|
||||
&movdqa ("xmm3","xmm0");
|
||||
&lea ($base,&DWP($k_ipt,$const));
|
||||
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
|
||||
&call ("_vpaes_schedule_transform");
|
||||
&movdqa ("xmm7","xmm0");
|
||||
|
||||
&test ($out,$out);
|
||||
&jnz (&label("schedule_am_decrypting"));
|
||||
|
||||
# encrypting, output zeroth round key after transform
|
||||
&movdqu (&QWP(0,$key),"xmm0");
|
||||
&jmp (&label("schedule_go"));
|
||||
|
||||
&set_label("schedule_am_decrypting");
|
||||
# decrypting, output zeroth round key after shiftrows
|
||||
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&movdqu (&QWP(0,$key),"xmm3");
|
||||
&xor ($magic,0x30);
|
||||
|
||||
&set_label("schedule_go");
|
||||
&cmp ($round,192);
|
||||
&ja (&label("schedule_256"));
|
||||
# 192-bit key support was removed.
|
||||
# 128: fall through
|
||||
|
||||
##
|
||||
## .schedule_128
|
||||
##
|
||||
## 128-bit specific part of key schedule.
|
||||
##
|
||||
## This schedule is really simple, because all its parts
|
||||
## are accomplished by the subroutines.
|
||||
##
|
||||
&set_label("schedule_128");
|
||||
&mov ($round,10);
|
||||
|
||||
&set_label("loop_schedule_128");
|
||||
&call ("_vpaes_schedule_round");
|
||||
&dec ($round);
|
||||
&jz (&label("schedule_mangle_last"));
|
||||
&call ("_vpaes_schedule_mangle"); # write output
|
||||
&jmp (&label("loop_schedule_128"));
|
||||
|
||||
##
|
||||
## .aes_schedule_256
|
||||
##
|
||||
## 256-bit specific part of key schedule.
|
||||
##
|
||||
## The structure here is very similar to the 128-bit
|
||||
## schedule, but with an additional "low side" in
|
||||
## %xmm6. The low side's rounds are the same as the
|
||||
## high side's, except no rcon and no rotation.
|
||||
##
|
||||
&set_label("schedule_256",16);
|
||||
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&mov ($round,7);
|
||||
|
||||
&set_label("loop_schedule_256");
|
||||
&call ("_vpaes_schedule_mangle"); # output low result
|
||||
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
|
||||
|
||||
# high round
|
||||
&call ("_vpaes_schedule_round");
|
||||
&dec ($round);
|
||||
&jz (&label("schedule_mangle_last"));
|
||||
&call ("_vpaes_schedule_mangle");
|
||||
|
||||
# low round. swap xmm7 and xmm6
|
||||
&pshufd ("xmm0","xmm0",0xFF);
|
||||
&movdqa (&QWP(20,"esp"),"xmm7");
|
||||
&movdqa ("xmm7","xmm6");
|
||||
&call ("_vpaes_schedule_low_round");
|
||||
&movdqa ("xmm7",&QWP(20,"esp"));
|
||||
|
||||
&jmp (&label("loop_schedule_256"));
|
||||
|
||||
##
|
||||
## .aes_schedule_mangle_last
|
||||
##
|
||||
## Mangler for last round of key schedule
|
||||
## Mangles %xmm0
|
||||
## when encrypting, outputs out(%xmm0) ^ 63
|
||||
## when decrypting, outputs unskew(%xmm0)
|
||||
##
|
||||
## Always called right before return... jumps to cleanup and exits
|
||||
##
|
||||
&set_label("schedule_mangle_last",16);
|
||||
# schedule last round key from xmm0
|
||||
&lea ($base,&DWP($k_deskew,$const));
|
||||
&test ($out,$out);
|
||||
&jnz (&label("schedule_mangle_last_dec"));
|
||||
|
||||
# encrypting
|
||||
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
|
||||
&pshufb ("xmm0","xmm1"); # output permute
|
||||
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
|
||||
&add ($key,32);
|
||||
|
||||
&set_label("schedule_mangle_last_dec");
|
||||
&add ($key,-16);
|
||||
&pxor ("xmm0",&QWP($k_s63,$const));
|
||||
&call ("_vpaes_schedule_transform"); # output transform
|
||||
&movdqu (&QWP(0,$key),"xmm0"); # save last key
|
||||
|
||||
# cleanup
|
||||
&pxor ("xmm0","xmm0");
|
||||
&pxor ("xmm1","xmm1");
|
||||
&pxor ("xmm2","xmm2");
|
||||
&pxor ("xmm3","xmm3");
|
||||
&pxor ("xmm4","xmm4");
|
||||
&pxor ("xmm5","xmm5");
|
||||
&pxor ("xmm6","xmm6");
|
||||
&pxor ("xmm7","xmm7");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_core");
|
||||
|
||||
##
|
||||
## .aes_schedule_round
|
||||
##
|
||||
## Runs one main round of the key schedule on %xmm0, %xmm7
|
||||
##
|
||||
## Specifically, runs subbytes on the high dword of %xmm0
|
||||
## then rotates it by one byte and xors into the low dword of
|
||||
## %xmm7.
|
||||
##
|
||||
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
|
||||
## next rcon.
|
||||
##
|
||||
## Smears the dwords of %xmm7 by xoring the low into the
|
||||
## second low, result into third, result into highest.
|
||||
##
|
||||
## Returns results in %xmm7 = %xmm0.
|
||||
## Clobbers %xmm1-%xmm5.
|
||||
##
|
||||
&function_begin_B("_vpaes_schedule_round");
|
||||
# extract rcon from xmm8
|
||||
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
|
||||
&pxor ("xmm1","xmm1");
|
||||
&palignr("xmm1","xmm2",15);
|
||||
&palignr("xmm2","xmm2",15);
|
||||
&pxor ("xmm7","xmm1");
|
||||
|
||||
# rotate
|
||||
&pshufd ("xmm0","xmm0",0xFF);
|
||||
&palignr("xmm0","xmm0",1);
|
||||
|
||||
# fall through...
|
||||
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
|
||||
|
||||
# low round: same as high round, but no rotation and no rcon.
|
||||
&set_label("_vpaes_schedule_low_round");
|
||||
# smear xmm7
|
||||
&movdqa ("xmm1","xmm7");
|
||||
&pslldq ("xmm7",4);
|
||||
&pxor ("xmm7","xmm1");
|
||||
&movdqa ("xmm1","xmm7");
|
||||
&pslldq ("xmm7",8);
|
||||
&pxor ("xmm7","xmm1");
|
||||
&pxor ("xmm7",&QWP($k_s63,$const));
|
||||
|
||||
# subbyte
|
||||
&movdqa ("xmm4",&QWP($k_s0F,$const));
|
||||
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
|
||||
&movdqa ("xmm1","xmm4");
|
||||
&pandn ("xmm1","xmm0");
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm4"); # 0 = k
|
||||
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&pshufb ("xmm2","xmm0"); # 2 = a/k
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&movdqa ("xmm3","xmm5"); # 3 : 1/i
|
||||
&pshufb ("xmm3","xmm1"); # 3 = 1/i
|
||||
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
|
||||
&movdqa ("xmm4","xmm5"); # 4 : 1/j
|
||||
&pshufb ("xmm4","xmm0"); # 4 = 1/j
|
||||
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
|
||||
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
|
||||
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
|
||||
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
|
||||
&pxor ("xmm3","xmm1"); # 3 = jo
|
||||
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbou
|
||||
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb1t
|
||||
&pxor ("xmm0","xmm4"); # 0 = sbox output
|
||||
|
||||
# add in smeared stuff
|
||||
&pxor ("xmm0","xmm7");
|
||||
&movdqa ("xmm7","xmm0");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_round");
|
||||
|
||||
##
|
||||
## .aes_schedule_transform
|
||||
##
|
||||
## Linear-transform %xmm0 according to tables at (%ebx)
|
||||
##
|
||||
## Output in %xmm0
|
||||
## Clobbers %xmm1, %xmm2
|
||||
##
|
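##
## The net effect, per byte (a minimal C sketch; lo_tbl and hi_tbl stand
## in for the two 16-byte tables at (%ebx) and 16(%ebx)):
##
#include <stdint.h>
#include <stddef.h>

/* Split each byte into its low and high nibble, look each nibble up in a
 * 16-entry table, and xor the two lookups, as the pandn/psrld/pand/
 * pshufb/pxor sequence below does for all 16 bytes at once. */
static void nibble_table_transform(uint8_t block[16],
                                   const uint8_t lo_tbl[16],
                                   const uint8_t hi_tbl[16]) {
  for (size_t i = 0; i < 16; i++) {
    uint8_t lo = block[i] & 0x0f;
    uint8_t hi = block[i] >> 4;
    block[i] = lo_tbl[lo] ^ hi_tbl[hi];
  }
}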
||||
&function_begin_B("_vpaes_schedule_transform");
|
||||
&movdqa ("xmm2",&QWP($k_s0F,$const));
|
||||
&movdqa ("xmm1","xmm2");
|
||||
&pandn ("xmm1","xmm0");
|
||||
&psrld ("xmm1",4);
|
||||
&pand ("xmm0","xmm2");
|
||||
&movdqa ("xmm2",&QWP(0,$base));
|
||||
&pshufb ("xmm2","xmm0");
|
||||
&movdqa ("xmm0",&QWP(16,$base));
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&pxor ("xmm0","xmm2");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_transform");
|
||||
|
||||
##
|
||||
## .aes_schedule_mangle
|
||||
##
|
||||
## Mangle xmm0 from (basis-transformed) standard version
|
||||
## to our version.
|
||||
##
|
||||
## On encrypt,
|
||||
## xor with 0x63
|
||||
## multiply by circulant 0,1,1,1
|
||||
## apply shiftrows transform
|
||||
##
|
||||
## On decrypt,
|
||||
## xor with 0x63
|
||||
## multiply by "inverse mixcolumns" circulant E,B,D,9
|
||||
## deskew
|
||||
## apply shiftrows transform
|
||||
##
|
||||
##
|
||||
## Writes out to (%edx), and increments or decrements it
|
||||
## Keeps track of round number mod 4 in %ecx
|
||||
## Preserves xmm0
|
||||
## Clobbers xmm1-xmm5
|
||||
##
|
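##
## The "multiply by circulant 0,1,1,1" step, sketched in C (the 0x63 xor
## and the shiftrows permutation are applied separately below):
##
#include <stdint.h>
#include <stddef.h>

/* Within each 4-byte column, every output byte is the xor of the other
 * three bytes of that column; the coefficients are all 0 or 1, so no
 * GF(2^8) multiplications are needed. */
static void mul_circulant_0111(const uint8_t in[16], uint8_t out[16]) {
  for (size_t col = 0; col < 4; col++) {
    const uint8_t *b = &in[4 * col];
    uint8_t *d = &out[4 * col];
    for (size_t i = 0; i < 4; i++) {
      d[i] = b[(i + 1) & 3] ^ b[(i + 2) & 3] ^ b[(i + 3) & 3];
    }
  }
}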
||||
&function_begin_B("_vpaes_schedule_mangle");
|
||||
&movdqa ("xmm4","xmm0"); # save xmm0 for later
|
||||
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
|
||||
&test ($out,$out);
|
||||
&jnz (&label("schedule_mangle_dec"));
|
||||
|
||||
# encrypting
|
||||
&add ($key,16);
|
||||
&pxor ("xmm4",&QWP($k_s63,$const));
|
||||
&pshufb ("xmm4","xmm5");
|
||||
&movdqa ("xmm3","xmm4");
|
||||
&pshufb ("xmm4","xmm5");
|
||||
&pxor ("xmm3","xmm4");
|
||||
&pshufb ("xmm4","xmm5");
|
||||
&pxor ("xmm3","xmm4");
|
||||
|
||||
&jmp (&label("schedule_mangle_both"));
|
||||
|
||||
&set_label("schedule_mangle_dec",16);
|
||||
# inverse mix columns
|
||||
&movdqa ("xmm2",&QWP($k_s0F,$const));
|
||||
&lea ($inp,&DWP($k_dksd,$const));
|
||||
&movdqa ("xmm1","xmm2");
|
||||
&pandn ("xmm1","xmm4");
|
||||
&psrld ("xmm1",4); # 1 = hi
|
||||
&pand ("xmm4","xmm2"); # 4 = lo
|
||||
|
||||
&movdqa ("xmm2",&QWP(0,$inp));
|
||||
&pshufb ("xmm2","xmm4");
|
||||
&movdqa ("xmm3",&QWP(0x10,$inp));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pshufb ("xmm3","xmm5");
|
||||
|
||||
&movdqa ("xmm2",&QWP(0x20,$inp));
|
||||
&pshufb ("xmm2","xmm4");
|
||||
&pxor ("xmm2","xmm3");
|
||||
&movdqa ("xmm3",&QWP(0x30,$inp));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pshufb ("xmm3","xmm5");
|
||||
|
||||
&movdqa ("xmm2",&QWP(0x40,$inp));
|
||||
&pshufb ("xmm2","xmm4");
|
||||
&pxor ("xmm2","xmm3");
|
||||
&movdqa ("xmm3",&QWP(0x50,$inp));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pshufb ("xmm3","xmm5");
|
||||
|
||||
&movdqa ("xmm2",&QWP(0x60,$inp));
|
||||
&pshufb ("xmm2","xmm4");
|
||||
&pxor ("xmm2","xmm3");
|
||||
&movdqa ("xmm3",&QWP(0x70,$inp));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&pxor ("xmm3","xmm2");
|
||||
|
||||
&add ($key,-16);
|
||||
|
||||
&set_label("schedule_mangle_both");
|
||||
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&add ($magic,-16);
|
||||
&and ($magic,0x30);
|
||||
&movdqu (&QWP(0,$key),"xmm3");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_mangle");
|
||||
|
||||
#
|
||||
# Interface to OpenSSL
|
||||
#
|
||||
&function_begin("GFp_${PREFIX}_set_encrypt_key");
|
||||
&mov ($inp,&wparam(0)); # inp
|
||||
&lea ($base,&DWP(-56,"esp"));
|
||||
&mov ($round,&wparam(1)); # bits
|
||||
&and ($base,-16);
|
||||
&mov ($key,&wparam(2)); # key
|
||||
&xchg ($base,"esp"); # alloca
|
||||
&mov (&DWP(48,"esp"),$base);
|
||||
|
||||
&mov ($base,$round);
|
||||
&shr ($base,5);
|
||||
&add ($base,5);
|
||||
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
|
||||
&mov ($magic,0x30);
|
||||
&mov ($out,0);
|
||||
|
||||
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
|
||||
&call ("_vpaes_schedule_core");
|
||||
&set_label("pic_point");
|
||||
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
&xor ("eax","eax");
|
||||
&function_end("GFp_${PREFIX}_set_encrypt_key");
|
||||
|
||||
&function_begin("GFp_${PREFIX}_encrypt");
|
||||
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
|
||||
&call ("_vpaes_preheat");
|
||||
&set_label("pic_point");
|
||||
&mov ($inp,&wparam(0)); # inp
|
||||
&lea ($base,&DWP(-56,"esp"));
|
||||
&mov ($out,&wparam(1)); # out
|
||||
&and ($base,-16);
|
||||
&mov ($key,&wparam(2)); # key
|
||||
&xchg ($base,"esp"); # alloca
|
||||
&mov (&DWP(48,"esp"),$base);
|
||||
|
||||
&movdqu ("xmm0",&QWP(0,$inp));
|
||||
&call ("_vpaes_encrypt_core");
|
||||
&movdqu (&QWP(0,$out),"xmm0");
|
||||
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
&function_end("GFp_${PREFIX}_encrypt");
|
||||
|
||||
&asm_finish();
|
||||
|
||||
close STDOUT or die "error closing STDOUT";
|
||||
1064
zeroidc/vendor/ring/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl
vendored
Normal file
File diff suppressed because it is too large
761
zeroidc/vendor/ring/crypto/fipsmodule/bn/asm/armv4-mont.pl
vendored
Normal file
@@ -0,0 +1,761 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# January 2007.
|
||||
|
||||
# Montgomery multiplication for ARMv4.
|
||||
#
|
||||
# Performance improvement naturally varies among CPU implementations
|
||||
# and compilers. The code was observed to provide +65-35% improvement
|
||||
# [depending on key length, less for longer keys] on ARM920T, and
|
||||
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
|
||||
# base and compiler generated code with in-lined umull and even umlal
|
||||
# instructions. The latter means that this code didn't really have an
|
||||
# "advantage" of utilizing some "secret" instruction.
|
||||
#
|
||||
# The code is interoperable with Thumb ISA and is rather compact, less
|
||||
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
|
||||
# about decorations, ABI and instruction syntax are identical.
|
||||
|
||||
# November 2013
|
||||
#
|
||||
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
|
||||
# performance improvement on Cortex-A8 is ~45-100% depending on key
|
||||
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
|
||||
# On Snapdragon S4 improvement was measured to vary from ~70% to
|
||||
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
|
||||
# rather because original integer-only code seems to perform
|
||||
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
|
||||
# different. It's being looked into, but the trouble is that
|
||||
# performance for vectors longer than 256 bits is actually couple
|
||||
# of percent worse than for integer-only code. The code is chosen
|
||||
# for execution on all NEON-capable processors, because gain on
|
||||
# others outweighs the marginal loss on Cortex-A9.
|
||||
|
||||
# September 2015
|
||||
#
|
||||
# Align Cortex-A9 performance with November 2013 improvements, i.e.
|
||||
# NEON code is now ~20-105% faster than integer-only one on this
|
||||
# processor. But this optimization further improved performance even
|
||||
# on other processors: NEON code path is ~45-180% faster than original
|
||||
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
|
||||
# Snapdragon S4.
|
||||
|
||||
$flavour = shift;
|
||||
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
} else {
|
||||
open OUT,">$output";
|
||||
*STDOUT=*OUT;
|
||||
}
|
||||
|
||||
$num="r0"; # starts as num argument, but holds &tp[num-1]
|
||||
$ap="r1";
|
||||
$bp="r2"; $bi="r2"; $rp="r2";
|
||||
$np="r3";
|
||||
$tp="r4";
|
||||
$aj="r5";
|
||||
$nj="r6";
|
||||
$tj="r7";
|
||||
$n0="r8";
|
||||
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
|
||||
$alo="r10"; # sl, gcc uses it to keep @GOT
|
||||
$ahi="r11"; # fp
|
||||
$nlo="r12"; # ip
|
||||
########### # r13 is stack pointer
|
||||
$nhi="r14"; # lr
|
||||
########### # r15 is program counter
|
||||
|
||||
#### argument block layout relative to &tp[num-1], a.k.a. $num
|
||||
$_rp="$num,#12*4";
|
||||
# ap permanently resides in r1
|
||||
$_bp="$num,#13*4";
|
||||
# np permanently resides in r3
|
||||
$_n0="$num,#14*4";
|
||||
$_num="$num,#15*4"; $_bpend=$_num;
|
||||
|
||||
$code=<<___;
|
||||
#include <GFp/arm_arch.h>
|
||||
|
||||
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
|
||||
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
|
||||
.arch armv7-a
|
||||
|
||||
.text
|
||||
#if defined(__thumb2__)
|
||||
.syntax unified
|
||||
.thumb
|
||||
#else
|
||||
.code 32
|
||||
#endif
|
||||
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.extern GFp_armcap_P
|
||||
.hidden GFp_armcap_P
|
||||
.align 5
|
||||
.LOPENSSL_armcap:
|
||||
.word GFp_armcap_P-.Lbn_mul_mont
|
||||
#endif
|
||||
|
||||
.global GFp_bn_mul_mont
|
||||
.type GFp_bn_mul_mont,%function
|
||||
|
||||
.align 5
|
||||
GFp_bn_mul_mont:
|
||||
.Lbn_mul_mont:
|
||||
ldr ip,[sp,#4] @ load num
|
||||
stmdb sp!,{r0,r2} @ sp points at argument block
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
tst ip,#7
|
||||
bne .Lialu
|
||||
adr r0,.Lbn_mul_mont
|
||||
ldr r2,.LOPENSSL_armcap
|
||||
ldr r0,[r0,r2]
|
||||
#ifdef __APPLE__
|
||||
ldr r0,[r0]
|
||||
#endif
|
||||
tst r0,#ARMV7_NEON @ NEON available?
|
||||
ldmia sp, {r0,r2}
|
||||
beq .Lialu
|
||||
add sp,sp,#8
|
||||
b bn_mul8x_mont_neon
|
||||
.align 4
|
||||
.Lialu:
|
||||
#endif
|
||||
cmp ip,#2
|
||||
mov $num,ip @ load num
|
||||
#ifdef __thumb2__
|
||||
ittt lt
|
||||
#endif
|
||||
movlt r0,#0
|
||||
addlt sp,sp,#2*4
|
||||
blt .Labrt
|
||||
|
||||
stmdb sp!,{r4-r12,lr} @ save 10 registers
|
||||
|
||||
mov $num,$num,lsl#2 @ rescale $num for byte count
|
||||
sub sp,sp,$num @ alloca(4*num)
|
||||
sub sp,sp,#4 @ +extra dword
|
||||
sub $num,$num,#4 @ "num=num-1"
|
||||
add $tp,$bp,$num @ &bp[num-1]
|
||||
|
||||
add $num,sp,$num @ $num to point at &tp[num-1]
|
||||
ldr $n0,[$_n0] @ &n0
|
||||
ldr $bi,[$bp] @ bp[0]
|
||||
ldr $aj,[$ap],#4 @ ap[0],ap++
|
||||
ldr $nj,[$np],#4 @ np[0],np++
|
||||
ldr $n0,[$n0] @ *n0
|
||||
str $tp,[$_bpend] @ save &bp[num]
|
||||
|
||||
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
|
||||
str $n0,[$_n0] @ save n0 value
|
||||
mul $n0,$alo,$n0 @ "tp[0]"*n0
|
||||
mov $nlo,#0
|
||||
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
|
||||
mov $tp,sp
|
||||
|
||||
.L1st:
|
||||
ldr $aj,[$ap],#4 @ ap[j],ap++
|
||||
mov $alo,$ahi
|
||||
ldr $nj,[$np],#4 @ np[j],np++
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
|
||||
mov $nhi,#0
|
||||
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
|
||||
adds $nlo,$nlo,$alo
|
||||
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
|
||||
adc $nlo,$nhi,#0
|
||||
cmp $tp,$num
|
||||
bne .L1st
|
||||
|
||||
adds $nlo,$nlo,$ahi
|
||||
ldr $tp,[$_bp] @ restore bp
|
||||
mov $nhi,#0
|
||||
ldr $n0,[$_n0] @ restore n0
|
||||
adc $nhi,$nhi,#0
|
||||
str $nlo,[$num] @ tp[num-1]=
|
||||
mov $tj,sp
|
||||
str $nhi,[$num,#4] @ tp[num]=
|
||||
|
||||
.Louter:
|
||||
sub $tj,$num,$tj @ "original" $num-1 value
|
||||
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
|
||||
ldr $bi,[$tp,#4]! @ *(++bp)
|
||||
sub $np,$np,$tj @ "rewind" np to &np[1]
|
||||
ldr $aj,[$ap,#-4] @ ap[0]
|
||||
ldr $alo,[sp] @ tp[0]
|
||||
ldr $nj,[$np,#-4] @ np[0]
|
||||
ldr $tj,[sp,#4] @ tp[1]
|
||||
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
|
||||
str $tp,[$_bp] @ save bp
|
||||
mul $n0,$alo,$n0
|
||||
mov $nlo,#0
|
||||
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
|
||||
mov $tp,sp
|
||||
|
||||
.Linner:
|
||||
ldr $aj,[$ap],#4 @ ap[j],ap++
|
||||
adds $alo,$ahi,$tj @ +=tp[j]
|
||||
ldr $nj,[$np],#4 @ np[j],np++
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
|
||||
mov $nhi,#0
|
||||
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
|
||||
adc $ahi,$ahi,#0
|
||||
ldr $tj,[$tp,#8] @ tp[j+1]
|
||||
adds $nlo,$nlo,$alo
|
||||
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
|
||||
adc $nlo,$nhi,#0
|
||||
cmp $tp,$num
|
||||
bne .Linner
|
||||
|
||||
adds $nlo,$nlo,$ahi
|
||||
mov $nhi,#0
|
||||
ldr $tp,[$_bp] @ restore bp
|
||||
adc $nhi,$nhi,#0
|
||||
ldr $n0,[$_n0] @ restore n0
|
||||
adds $nlo,$nlo,$tj
|
||||
ldr $tj,[$_bpend] @ restore &bp[num]
|
||||
adc $nhi,$nhi,#0
|
||||
str $nlo,[$num] @ tp[num-1]=
|
||||
str $nhi,[$num,#4] @ tp[num]=
|
||||
|
||||
cmp $tp,$tj
|
||||
#ifdef __thumb2__
|
||||
itt ne
|
||||
#endif
|
||||
movne $tj,sp
|
||||
bne .Louter
|
||||
|
||||
ldr $rp,[$_rp] @ pull rp
|
||||
mov $aj,sp
|
||||
add $num,$num,#4 @ $num to point at &tp[num]
|
||||
sub $aj,$num,$aj @ "original" num value
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
mov $ap,$tp @ "borrow" $ap
|
||||
sub $np,$np,$aj @ "rewind" $np to &np[0]
|
||||
|
||||
subs $tj,$tj,$tj @ "clear" carry flag
|
||||
.Lsub: ldr $tj,[$tp],#4
|
||||
ldr $nj,[$np],#4
|
||||
sbcs $tj,$tj,$nj @ tp[j]-np[j]
|
||||
str $tj,[$rp],#4 @ rp[j]=
|
||||
teq $tp,$num @ preserve carry
|
||||
bne .Lsub
|
||||
sbcs $nhi,$nhi,#0 @ upmost carry
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
sub $rp,$rp,$aj @ "rewind" $rp
|
||||
|
||||
.Lcopy: ldr $tj,[$tp] @ conditional copy
|
||||
ldr $aj,[$rp]
|
||||
str sp,[$tp],#4 @ zap tp
|
||||
#ifdef __thumb2__
|
||||
it cc
|
||||
#endif
|
||||
movcc $aj,$tj
|
||||
str $aj,[$rp],#4
|
||||
teq $tp,$num @ preserve carry
|
||||
bne .Lcopy
|
||||
|
||||
mov sp,$num
|
||||
add sp,sp,#4 @ skip over tp[num+1]
|
||||
ldmia sp!,{r4-r12,lr} @ restore registers
|
||||
add sp,sp,#2*4 @ skip over {r0,r2}
|
||||
mov r0,#1
|
||||
.Labrt:
|
||||
#if __ARM_ARCH__>=5
|
||||
ret @ bx lr
|
||||
#else
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size GFp_bn_mul_mont,.-GFp_bn_mul_mont
|
||||
___
|
||||
{
|
||||
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
|
||||
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
|
||||
my ($Z,$Temp)=("q4","q5");
|
||||
my @ACC=map("q$_",(6..13));
|
||||
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
|
||||
my $zero="$Z#lo";
|
||||
my $temp="$Temp#lo";
|
||||
|
||||
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
|
||||
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
|
||||
|
||||
$code.=<<___;
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
.type bn_mul8x_mont_neon,%function
|
||||
.align 5
|
||||
bn_mul8x_mont_neon:
|
||||
mov ip,sp
|
||||
stmdb sp!,{r4-r11}
|
||||
vstmdb sp!,{d8-d15} @ ABI specification says so
|
||||
ldmia ip,{r4-r5} @ load rest of parameter block
|
||||
mov ip,sp
|
||||
|
||||
cmp $num,#8
|
||||
bhi .LNEON_8n
|
||||
|
||||
@ special case for $num==8, everything is in register bank...
|
||||
|
||||
vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||||
veor $zero,$zero,$zero
|
||||
sub $toutptr,sp,$num,lsl#4
|
||||
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
|
||||
and $toutptr,$toutptr,#-64
|
||||
vld1.32 {${M0}[0]}, [$n0,:32]
|
||||
mov sp,$toutptr @ alloca
|
||||
vzip.16 $Bi,$zero
|
||||
|
||||
vmull.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vmull.u32 @ACC[1],$Bi,${A0}[1]
|
||||
vmull.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||||
vmull.u32 @ACC[3],$Bi,${A1}[1]
|
||||
|
||||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||||
veor $zero,$zero,$zero
|
||||
vmul.u32 $Ni,$Ni,$M0
|
||||
|
||||
vmull.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vld1.32 {$N0-$N3}, [$nptr]!
|
||||
vmull.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vmull.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmull.u32 @ACC[7],$Bi,${A3}[1]
|
||||
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
sub $outer,$num,#1
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vmov $Temp,@ACC[0]
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vmov @ACC[0],@ACC[1]
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmov @ACC[1],@ACC[2]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vmov @ACC[2],@ACC[3]
|
||||
vmov @ACC[3],@ACC[4]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
vmov @ACC[4],@ACC[5]
|
||||
vmov @ACC[5],@ACC[6]
|
||||
vadd.u64 $temp,$temp,$Temp#hi
|
||||
vmov @ACC[6],@ACC[7]
|
||||
veor @ACC[7],@ACC[7]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
|
||||
b .LNEON_outer8
|
||||
|
||||
.align 4
|
||||
.LNEON_outer8:
|
||||
vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||||
veor $zero,$zero,$zero
|
||||
vzip.16 $Bi,$zero
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||||
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
|
||||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||||
veor $zero,$zero,$zero
|
||||
subs $outer,$outer,#1
|
||||
vmul.u32 $Ni,$Ni,$M0
|
||||
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vmov $Temp,@ACC[0]
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vmov @ACC[0],@ACC[1]
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmov @ACC[1],@ACC[2]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vmov @ACC[2],@ACC[3]
|
||||
vmov @ACC[3],@ACC[4]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
vmov @ACC[4],@ACC[5]
|
||||
vmov @ACC[5],@ACC[6]
|
||||
vadd.u64 $temp,$temp,$Temp#hi
|
||||
vmov @ACC[6],@ACC[7]
|
||||
veor @ACC[7],@ACC[7]
|
||||
vshr.u64 $temp,$temp,#16
|
||||
|
||||
bne .LNEON_outer8
|
||||
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||||
mov $toutptr,sp
|
||||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||||
mov $inner,$num
|
||||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||||
add $tinptr,sp,#96
|
||||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||||
|
||||
b .LNEON_tail_entry
|
||||
|
||||
.align 4
|
||||
.LNEON_8n:
|
||||
veor @ACC[0],@ACC[0],@ACC[0]
|
||||
sub $toutptr,sp,#128
|
||||
veor @ACC[1],@ACC[1],@ACC[1]
|
||||
sub $toutptr,$toutptr,$num,lsl#4
|
||||
veor @ACC[2],@ACC[2],@ACC[2]
|
||||
and $toutptr,$toutptr,#-64
|
||||
veor @ACC[3],@ACC[3],@ACC[3]
|
||||
mov sp,$toutptr @ alloca
|
||||
veor @ACC[4],@ACC[4],@ACC[4]
|
||||
add $toutptr,$toutptr,#256
|
||||
veor @ACC[5],@ACC[5],@ACC[5]
|
||||
sub $inner,$num,#8
|
||||
veor @ACC[6],@ACC[6],@ACC[6]
|
||||
veor @ACC[7],@ACC[7],@ACC[7]
|
||||
|
||||
.LNEON_8n_init:
|
||||
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
|
||||
subs $inner,$inner,#8
|
||||
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
|
||||
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
|
||||
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
|
||||
bne .LNEON_8n_init
|
||||
|
||||
add $tinptr,sp,#256
|
||||
vld1.32 {$A0-$A3},[$aptr]!
|
||||
add $bnptr,sp,#8
|
||||
vld1.32 {${M0}[0]},[$n0,:32]
|
||||
mov $outer,$num
|
||||
b .LNEON_8n_outer
|
||||
|
||||
.align 4
|
||||
.LNEON_8n_outer:
|
||||
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
|
||||
veor $zero,$zero,$zero
|
||||
vzip.16 $Bi,$zero
|
||||
add $toutptr,sp,#128
|
||||
vld1.32 {$N0-$N3},[$nptr]!
|
||||
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
veor $zero,$zero,$zero
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmul.u32 $Ni,$Ni,$M0
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
___
|
||||
for ($i=0; $i<7;) {
|
||||
$code.=<<___;
|
||||
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
veor $temp,$temp,$temp
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vzip.16 $Bi,$temp
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
|
||||
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
|
||||
___
|
||||
push(@ACC,shift(@ACC)); $i++;
|
||||
$code.=<<___;
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vld1.64 {@ACC[7]},[$tinptr,:128]!
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
veor $zero,$zero,$zero
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vshl.i64 $Ni,@ACC[0]#hi,#16
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
vadd.u64 $Ni,$Ni,@ACC[0]#lo
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmul.u32 $Ni,$Ni,$M0
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vzip.16 $Ni,$zero
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
vld1.32 {$A0-$A3},[$aptr]!
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
|
||||
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
|
||||
add $bnptr,sp,#8 @ rewind
|
||||
___
|
||||
push(@ACC,shift(@ACC));
|
||||
$code.=<<___;
|
||||
sub $inner,$num,#8
|
||||
b .LNEON_8n_inner
|
||||
|
||||
.align 4
|
||||
.LNEON_8n_inner:
|
||||
subs $inner,$inner,#8
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vld1.64 {@ACC[7]},[$tinptr,:128]
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
vld1.32 {$N0-$N3},[$nptr]!
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
it ne
|
||||
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
___
|
||||
for ($i=1; $i<8; $i++) {
|
||||
$code.=<<___;
|
||||
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
vst1.64 {@ACC[0]},[$toutptr,:128]!
|
||||
___
|
||||
push(@ACC,shift(@ACC));
|
||||
$code.=<<___;
|
||||
vmlal.u32 @ACC[0],$Bi,${A0}[0]
|
||||
vld1.64 {@ACC[7]},[$tinptr,:128]
|
||||
vmlal.u32 @ACC[1],$Bi,${A0}[1]
|
||||
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
|
||||
vmlal.u32 @ACC[2],$Bi,${A1}[0]
|
||||
it ne
|
||||
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
|
||||
vmlal.u32 @ACC[3],$Bi,${A1}[1]
|
||||
vmlal.u32 @ACC[4],$Bi,${A2}[0]
|
||||
vmlal.u32 @ACC[5],$Bi,${A2}[1]
|
||||
vmlal.u32 @ACC[6],$Bi,${A3}[0]
|
||||
vmlal.u32 @ACC[7],$Bi,${A3}[1]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
it eq
|
||||
subeq $aptr,$aptr,$num,lsl#2 @ rewind
|
||||
vmlal.u32 @ACC[0],$Ni,${N0}[0]
|
||||
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
|
||||
vmlal.u32 @ACC[1],$Ni,${N0}[1]
|
||||
vld1.32 {$A0-$A3},[$aptr]!
|
||||
vmlal.u32 @ACC[2],$Ni,${N1}[0]
|
||||
add $bnptr,sp,#8 @ rewind
|
||||
vmlal.u32 @ACC[3],$Ni,${N1}[1]
|
||||
vmlal.u32 @ACC[4],$Ni,${N2}[0]
|
||||
vmlal.u32 @ACC[5],$Ni,${N2}[1]
|
||||
vmlal.u32 @ACC[6],$Ni,${N3}[0]
|
||||
vst1.64 {@ACC[0]},[$toutptr,:128]!
|
||||
vmlal.u32 @ACC[7],$Ni,${N3}[1]
|
||||
|
||||
bne .LNEON_8n_inner
|
||||
___
|
||||
push(@ACC,shift(@ACC));
|
||||
$code.=<<___;
|
||||
add $tinptr,sp,#128
|
||||
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
|
||||
veor q2,q2,q2 @ $N0-$N1
|
||||
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
|
||||
veor q3,q3,q3 @ $N2-$N3
|
||||
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
|
||||
vst1.64 {@ACC[6]},[$toutptr,:128]
|
||||
|
||||
subs $outer,$outer,#8
|
||||
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
|
||||
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
|
||||
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
|
||||
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
|
||||
|
||||
itt ne
|
||||
subne $nptr,$nptr,$num,lsl#2 @ rewind
|
||||
bne .LNEON_8n_outer
|
||||
|
||||
add $toutptr,sp,#128
|
||||
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
|
||||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||||
vst1.64 {q2-q3},[sp,:256]!
|
||||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||||
vst1.64 {q2-q3}, [sp,:256]!
|
||||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||||
vst1.64 {q2-q3}, [sp,:256]!
|
||||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||||
|
||||
mov $inner,$num
|
||||
b .LNEON_tail_entry
|
||||
|
||||
.align 4
|
||||
.LNEON_tail:
|
||||
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
|
||||
vshr.u64 $temp,@ACC[0]#lo,#16
|
||||
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
|
||||
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
|
||||
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
|
||||
vshr.u64 $temp,@ACC[0]#hi,#16
|
||||
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
|
||||
vzip.16 @ACC[0]#lo,@ACC[0]#hi
|
||||
|
||||
.LNEON_tail_entry:
|
||||
___
|
||||
for ($i=1; $i<8; $i++) {
|
||||
$code.=<<___;
|
||||
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
|
||||
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
|
||||
vshr.u64 $temp,@ACC[1]#lo,#16
|
||||
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
|
||||
vshr.u64 $temp,@ACC[1]#hi,#16
|
||||
vzip.16 @ACC[1]#lo,@ACC[1]#hi
|
||||
___
|
||||
push(@ACC,shift(@ACC));
|
||||
}
|
||||
push(@ACC,shift(@ACC));
|
||||
$code.=<<___;
|
||||
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
|
||||
subs $inner,$inner,#8
|
||||
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
|
||||
bne .LNEON_tail
|
||||
|
||||
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
|
||||
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
|
||||
subs $aptr,sp,#0 @ clear carry flag
|
||||
add $bptr,sp,$num,lsl#2
|
||||
|
||||
.LNEON_sub:
|
||||
ldmia $aptr!, {r4-r7}
|
||||
ldmia $nptr!, {r8-r11}
|
||||
sbcs r8, r4,r8
|
||||
sbcs r9, r5,r9
|
||||
sbcs r10,r6,r10
|
||||
sbcs r11,r7,r11
|
||||
teq $aptr,$bptr @ preserves carry
|
||||
stmia $rptr!, {r8-r11}
|
||||
bne .LNEON_sub
|
||||
|
||||
ldr r10, [$aptr] @ load top-most bit
|
||||
mov r11,sp
|
||||
veor q0,q0,q0
|
||||
sub r11,$bptr,r11 @ this is num*4
|
||||
veor q1,q1,q1
|
||||
mov $aptr,sp
|
||||
sub $rptr,$rptr,r11 @ rewind $rptr
|
||||
mov $nptr,$bptr @ second 3/4th of frame
|
||||
sbcs r10,r10,#0 @ result is carry flag
|
||||
|
||||
.LNEON_copy_n_zap:
|
||||
ldmia $aptr!, {r4-r7}
|
||||
ldmia $rptr, {r8-r11}
|
||||
it cc
|
||||
movcc r8, r4
|
||||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||||
itt cc
|
||||
movcc r9, r5
|
||||
movcc r10,r6
|
||||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||||
it cc
|
||||
movcc r11,r7
|
||||
ldmia $aptr, {r4-r7}
|
||||
stmia $rptr!, {r8-r11}
|
||||
sub $aptr,$aptr,#16
|
||||
ldmia $rptr, {r8-r11}
|
||||
it cc
|
||||
movcc r8, r4
|
||||
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
|
||||
itt cc
|
||||
movcc r9, r5
|
||||
movcc r10,r6
|
||||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||||
it cc
|
||||
movcc r11,r7
|
||||
teq $aptr,$bptr @ preserves carry
|
||||
stmia $rptr!, {r8-r11}
|
||||
bne .LNEON_copy_n_zap
|
||||
|
||||
mov sp,ip
|
||||
vldmia sp!,{d8-d15}
|
||||
ldmia sp!,{r4-r11}
|
||||
ret @ bx lr
|
||||
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
|
||||
#endif
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
|
||||
s/\bret\b/bx lr/g or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
close STDOUT or die "error closing STDOUT";
|
||||
1523
zeroidc/vendor/ring/crypto/fipsmodule/bn/asm/armv8-mont.pl
vendored
Normal file
File diff suppressed because it is too large
336
zeroidc/vendor/ring/crypto/fipsmodule/bn/asm/x86-mont.pl
vendored
Normal file
@@ -0,0 +1,336 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# October 2005
|
||||
#
|
||||
# This is a "teaser" code, as it can be improved in several ways...
|
||||
# First of all non-SSE2 path should be implemented (yes, for now it
|
||||
# performs Montgomery multiplication/convolution only on SSE2-capable
|
||||
# CPUs such as P4, others fall down to original code). Then inner loop
|
||||
# can be unrolled and modulo-scheduled to improve ILP and possibly
|
||||
# moved to 128-bit XMM register bank (though it would require input
|
||||
# rearrangement and/or increase bus bandwidth utilization). Dedicated
|
||||
# squaring procedure should give further performance improvement...
|
||||
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
|
||||
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
|
||||
|
||||
# December 2006
|
||||
#
|
||||
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
|
||||
# Integer-only code [being equipped with dedicated squaring procedure]
|
||||
# gives ~40% on rsa512 sign benchmark...
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
$output = pop;
|
||||
open STDOUT,">$output";
|
||||
|
||||
&asm_init($ARGV[0]);
|
||||
|
||||
$sse2=0;
|
||||
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
|
||||
&external_label("GFp_ia32cap_P") if ($sse2);
|
||||
|
||||
&function_begin("GFp_bn_mul_mont");
|
||||
|
||||
$i="edx";
|
||||
$j="ecx";
|
||||
$ap="esi"; $tp="esi"; # overlapping variables!!!
|
||||
$rp="edi"; $bp="edi"; # overlapping variables!!!
|
||||
$np="ebp";
|
||||
$num="ebx";
|
||||
|
||||
$_num=&DWP(4*0,"esp"); # stack top layout
|
||||
$_rp=&DWP(4*1,"esp");
|
||||
$_ap=&DWP(4*2,"esp");
|
||||
$_bp=&DWP(4*3,"esp");
|
||||
$_np=&DWP(4*4,"esp");
|
||||
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
|
||||
$_sp=&DWP(4*6,"esp");
|
||||
$_bpend=&DWP(4*7,"esp");
|
||||
$frame=32; # size of above frame rounded up to 16n
|
||||
|
||||
&xor ("eax","eax");
|
||||
&mov ("edi",&wparam(5)); # int num
|
||||
|
||||
&lea ("esi",&wparam(0)); # put aside pointer to argument block
|
||||
&lea ("edx",&wparam(1)); # load ap
|
||||
&add ("edi",2); # extra two words on top of tp
|
||||
&neg ("edi");
|
||||
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
|
||||
&neg ("edi");
|
||||
|
||||
# minimize cache contention by arranging 2K window between stack
|
||||
# pointer and ap argument [np is also position sensitive vector,
|
||||
# but it's assumed to be near ap, as it's allocated at ~same
|
||||
# time].
|
||||
&mov ("eax","ebp");
|
||||
&sub ("eax","edx");
|
||||
&and ("eax",2047);
|
||||
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
|
||||
|
||||
&xor ("edx","ebp");
|
||||
&and ("edx",2048);
|
||||
&xor ("edx",2048);
|
||||
&sub ("ebp","edx"); # this splits them apart modulo 4096
|
||||
|
||||
&and ("ebp",-64); # align to cache line
|
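#
# The address juggling above, restated as a C sketch (sp_candidate and ap
# are the raw addresses; this covers only the arithmetic, not the page
# walk that follows):
#
#include <stdint.h>

static uintptr_t place_frame(uintptr_t sp_candidate, uintptr_t ap) {
  uintptr_t sp = sp_candidate;
  sp -= (sp - ap) & 2047;           /* align sp and ap modulo 2048 */
  sp -= ((sp ^ ap) & 2048) ^ 2048;  /* split them apart modulo 4096 */
  sp &= ~(uintptr_t)63;             /* align to a cache line */
  return sp;
}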
||||
|
||||
# An OS-agnostic version of __chkstk.
|
||||
#
|
||||
# Some OSes (Windows) insist on stack being "wired" to
|
||||
# physical memory in strictly sequential manner, i.e. if stack
|
||||
# allocation spans two pages, then reference to farmost one can
|
||||
# be punishable by SEGV. But page walking can do good even on
|
||||
# other OSes, because it guarantees that villain thread hits
|
||||
# the guard page before it can make damage to innocent one...
|
||||
&mov ("eax","esp");
|
||||
&sub ("eax","ebp");
|
||||
&and ("eax",-4096);
|
||||
&mov ("edx","esp"); # saved stack pointer!
|
||||
&lea ("esp",&DWP(0,"ebp","eax"));
|
||||
&mov ("eax",&DWP(0,"esp"));
|
||||
&cmp ("esp","ebp");
|
||||
&ja (&label("page_walk"));
|
||||
&jmp (&label("page_walk_done"));
|
||||
|
||||
&set_label("page_walk",16);
|
||||
&lea ("esp",&DWP(-4096,"esp"));
|
||||
&mov ("eax",&DWP(0,"esp"));
|
||||
&cmp ("esp","ebp");
|
||||
&ja (&label("page_walk"));
|
||||
&set_label("page_walk_done");
|
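#
# The page walk in plain C (a sketch only): touch one byte in every 4K
# page of the new frame, highest page first, so an OS that commits stack
# pages strictly in order sees each guard page in sequence.
#
#include <stdint.h>
#include <stddef.h>

static void probe_stack_pages(volatile uint8_t *base, size_t len) {
  for (size_t off = len; off >= 4096; off -= 4096) {
    (void)base[off - 1];  /* touch one byte per page, top down */
  }
  (void)base[0];
}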
||||
|
||||
################################# load argument block...
|
||||
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
|
||||
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
|
||||
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
|
||||
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
|
||||
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
|
||||
#&mov ("edi",&DWP(5*4,"esi"));# int num
|
||||
|
||||
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
|
||||
&mov ($_rp,"eax"); # ... save a copy of argument block
|
||||
&mov ($_ap,"ebx");
|
||||
&mov ($_bp,"ecx");
|
||||
&mov ($_np,"ebp");
|
||||
&mov ($_n0,"esi");
|
||||
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
|
||||
#&mov ($_num,$num); # redundant as $num is not reused
|
||||
&mov ($_sp,"edx"); # saved stack pointer!
|
||||
|
||||
if($sse2) {
|
||||
$acc0="mm0"; # mmx register bank layout
|
||||
$acc1="mm1";
|
||||
$car0="mm2";
|
||||
$car1="mm3";
|
||||
$mul0="mm4";
|
||||
$mul1="mm5";
|
||||
$temp="mm6";
|
||||
$mask="mm7";
|
||||
|
||||
&picmeup("eax","GFp_ia32cap_P");
|
||||
&bt (&DWP(0,"eax"),26);
|
||||
# The non-SSE2 code was removed.
|
||||
|
||||
&mov ("eax",-1);
|
||||
&movd ($mask,"eax"); # mask 32 lower bits
|
||||
|
||||
&mov ($ap,$_ap); # load input pointers
|
||||
&mov ($bp,$_bp);
|
||||
&mov ($np,$_np);
|
||||
|
||||
&xor ($i,$i); # i=0
|
||||
&xor ($j,$j); # j=0
|
||||
|
||||
&movd ($mul0,&DWP(0,$bp)); # bp[0]
|
||||
&movd ($mul1,&DWP(0,$ap)); # ap[0]
|
||||
&movd ($car1,&DWP(0,$np)); # np[0]
|
||||
|
||||
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
|
||||
&movq ($car0,$mul1);
|
||||
&movq ($acc0,$mul1); # I wish movd worked for
|
||||
&pand ($acc0,$mask); # inter-register transfers
|
||||
|
||||
&pmuludq($mul1,$_n0q); # *=n0
|
||||
|
||||
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
|
||||
&paddq ($car1,$acc0);
|
||||
|
||||
&movd ($acc1,&DWP(4,$np)); # np[1]
|
||||
&movd ($acc0,&DWP(4,$ap)); # ap[1]
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&inc ($j); # j++
|
||||
&set_label("1st",16);
|
||||
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
|
||||
&pmuludq($acc1,$mul1); # np[j]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
|
||||
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
|
||||
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
|
||||
&psrlq ($car0,32);
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&cmp ($j,$num);
|
||||
&jl (&label("1st"));
|
||||
|
||||
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
|
||||
&pmuludq($acc1,$mul1); # np[num-1]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&paddq ($car1,$car0);
|
||||
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
|
||||
|
||||
&inc ($i); # i++
|
||||
&set_label("outer");
|
||||
&xor ($j,$j); # j=0
|
||||
|
||||
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
|
||||
&movd ($mul1,&DWP(0,$ap)); # ap[0]
|
||||
&movd ($temp,&DWP($frame,"esp")); # tp[0]
|
||||
&movd ($car1,&DWP(0,$np)); # np[0]
|
||||
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
|
||||
|
||||
&paddq ($mul1,$temp); # +=tp[0]
|
||||
&movq ($acc0,$mul1);
|
||||
&movq ($car0,$mul1);
|
||||
&pand ($acc0,$mask);
|
||||
|
||||
&pmuludq($mul1,$_n0q); # *=n0
|
||||
|
||||
&pmuludq($car1,$mul1);
|
||||
&paddq ($car1,$acc0);
|
||||
|
||||
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
|
||||
&movd ($acc1,&DWP(4,$np)); # np[1]
|
||||
&movd ($acc0,&DWP(4,$ap)); # ap[1]
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
&paddq ($car0,$temp); # +=tp[1]
|
||||
|
||||
&inc ($j); # j++
|
||||
&dec ($num);
|
||||
&set_label("inner");
|
||||
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
|
||||
&pmuludq($acc1,$mul1); # np[j]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
|
||||
&pand ($acc0,$mask);
|
||||
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
|
||||
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
|
||||
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
|
||||
&psrlq ($car0,32);
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
|
||||
&psrlq ($car1,32);
|
||||
&paddq ($car0,$temp); # +=tp[j+1]
|
||||
|
||||
&dec ($num);
|
||||
&lea ($j,&DWP(1,$j)); # j++
|
||||
&jnz (&label("inner"));
|
||||
|
||||
&mov ($num,$j);
|
||||
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
|
||||
&pmuludq($acc1,$mul1); # np[num-1]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
|
||||
&paddq ($car1,$car0);
|
||||
&paddq ($car1,$temp);
|
||||
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
|
||||
|
||||
&lea ($i,&DWP(1,$i)); # i++
|
||||
&cmp ($i,$num);
|
||||
&jle (&label("outer"));
|
||||
|
||||
&emms (); # done with mmx bank
|
||||
|
||||
} # The non-SSE2 code was removed.
|
||||
|
||||
&set_label("common_tail",16);
|
||||
&mov ($np,$_np); # load modulus pointer
|
||||
&mov ($rp,$_rp); # load result pointer
|
||||
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
|
||||
|
||||
&mov ("eax",&DWP(0,$tp)); # tp[0]
|
||||
&mov ($j,$num); # j=num-1
|
||||
&xor ($i,$i); # i=0 and clear CF!
|
||||
|
||||
&set_label("sub",16);
|
||||
&sbb ("eax",&DWP(0,$np,$i,4));
|
||||
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
|
||||
&dec ($j); # doesn't affect CF!
|
||||
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
|
||||
&lea ($i,&DWP(1,$i)); # i++
|
||||
&jge (&label("sub"));
|
||||
|
||||
&sbb ("eax",0); # handle upmost overflow bit
|
||||
&mov ("edx",-1);
|
||||
&xor ("edx","eax");
|
||||
&jmp (&label("copy"));
|
||||
|
||||
&set_label("copy",16); # conditional copy
|
||||
&mov ($tp,&DWP($frame,"esp",$num,4));
|
||||
&mov ($np,&DWP(0,$rp,$num,4));
|
||||
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
|
||||
&and ($tp,"eax");
|
||||
&and ($np,"edx");
|
||||
&or ($np,$tp);
|
||||
&mov (&DWP(0,$rp,$num,4),$np);
|
||||
&dec ($num);
|
||||
&jge (&label("copy"));
|
||||
|
||||
&mov ("esp",$_sp); # pull saved stack pointer
|
||||
&mov ("eax",1);
|
||||
&function_end("GFp_bn_mul_mont");
|
||||
|
||||
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
||||
|
||||
close STDOUT or die "error closing STDOUT";
|
||||
1579
zeroidc/vendor/ring/crypto/fipsmodule/bn/asm/x86_64-mont.pl
vendored
Normal file
File diff suppressed because it is too large
3932
zeroidc/vendor/ring/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
vendored
Normal file
File diff suppressed because it is too large
197
zeroidc/vendor/ring/crypto/fipsmodule/bn/internal.h
vendored
Normal file
@@ -0,0 +1,197 @@
|
||||
/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
|
||||
* All rights reserved.
|
||||
*
|
||||
* This package is an SSL implementation written
|
||||
* by Eric Young (eay@cryptsoft.com).
|
||||
* The implementation was written so as to conform with Netscapes SSL.
|
||||
*
|
||||
* This library is free for commercial and non-commercial use as long as
|
||||
* the following conditions are aheared to. The following conditions
|
||||
* apply to all code found in this distribution, be it the RC4, RSA,
|
||||
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
|
||||
* included with this distribution is covered by the same copyright terms
|
||||
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
|
||||
*
|
||||
* Copyright remains Eric Young's, and as such any Copyright notices in
|
||||
* the code are not to be removed.
|
||||
* If this package is used in a product, Eric Young should be given attribution
|
||||
* as the author of the parts of the library used.
|
||||
* This can be in the form of a textual message at program startup or
|
||||
* in documentation (online or textual) provided with the package.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* "This product includes cryptographic software written by
|
||||
* Eric Young (eay@cryptsoft.com)"
|
||||
* The word 'cryptographic' can be left out if the rouines from the library
|
||||
* being used are not cryptographic related :-).
|
||||
* 4. If you include any Windows specific code (or a derivative thereof) from
|
||||
* the apps directory (application code) you must include an acknowledgement:
|
||||
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* The licence and distribution terms for any publically available version or
|
||||
* derivative of this code cannot be changed. i.e. this code cannot simply be
|
||||
* copied and put under another distribution licence
|
||||
* [including the GNU Public Licence.]
|
||||
*/
|
||||
/* ====================================================================
|
||||
* Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this
|
||||
* software must display the following acknowledgment:
|
||||
* "This product includes software developed by the OpenSSL Project
|
||||
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
|
||||
*
|
||||
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
|
||||
* endorse or promote products derived from this software without
|
||||
* prior written permission. For written permission, please contact
|
||||
* openssl-core@openssl.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "OpenSSL"
|
||||
* nor may "OpenSSL" appear in their names without prior written
|
||||
* permission of the OpenSSL Project.
|
||||
*
|
||||
* 6. Redistributions of any form whatsoever must retain the following
|
||||
* acknowledgment:
|
||||
* "This product includes software developed by the OpenSSL Project
|
||||
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
|
||||
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This product includes cryptographic software written by Eric Young
|
||||
* (eay@cryptsoft.com). This product includes software written by Tim
|
||||
* Hudson (tjh@cryptsoft.com).
|
||||
*
|
||||
*/
|
||||
/* ====================================================================
|
||||
* Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
|
||||
*
|
||||
* Portions of the attached software ("Contribution") are developed by
|
||||
* SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
|
||||
*
|
||||
* The Contribution is licensed pursuant to the Eric Young open source
|
||||
* license provided above.
|
||||
*
|
||||
* The binary polynomial arithmetic software is originally written by
|
||||
* Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
|
||||
* Laboratories. */
|
||||
|
||||
#ifndef OPENSSL_HEADER_BN_INTERNAL_H
|
||||
#define OPENSSL_HEADER_BN_INTERNAL_H
|
||||
|
||||
#include <GFp/base.h>
|
||||
|
||||
#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
|
||||
#pragma warning(push, 3)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
#pragma intrinsic(_umul128)
|
||||
#endif
|
||||
|
||||
#include "../../internal.h"
|
||||
|
||||
typedef crypto_word BN_ULONG;
|
||||
|
||||
#if defined(OPENSSL_64_BIT)
|
||||
|
||||
#if defined(BORINGSSL_HAS_UINT128)
|
||||
// MSVC doesn't support two-word integers on 64-bit.
|
||||
#define BN_ULLONG uint128_t
|
||||
#endif
|
||||
|
||||
#define BN_BITS2 64
|
||||
#define BN_MONT_CTX_N0_LIMBS 1
|
||||
#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo), 0
|
||||
#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
|
||||
|
||||
#elif defined(OPENSSL_32_BIT)
|
||||
|
||||
#define BN_ULLONG uint64_t
|
||||
#define BN_BITS2 32
|
||||
// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
|
||||
// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
|
||||
// needs to be two words long. Only certain 32-bit platforms actually make use
|
||||
// of n0[1] and shorter R value would suffice for the others. However,
|
||||
// currently only the assembly files know which is which.
|
||||
#define BN_MONT_CTX_N0_LIMBS 2
|
||||
#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo)
|
||||
#define TOBN(hi, lo) (lo), (hi)
|
||||
|
||||
#else
|
||||
#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
|
||||
#endif
|
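// For example (arbitrary values), BN_MONT_CTX_N0(0x12345678, 0x9abcdef0)
// expands on a 64-bit target (BN_MONT_CTX_N0_LIMBS == 1) to
//
//   (BN_ULONG)0x12345678 << 32 | 0x9abcdef0, 0
//
// i.e. one 64-bit limb followed by a zero pad, and on a 32-bit target
// (BN_MONT_CTX_N0_LIMBS == 2) to
//
//   0x9abcdef0, 0x12345678
//
// i.e. two 32-bit limbs with the low word first.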
||||
|
||||
|
||||
// |num| must be at least 4, at least on x86.
|
||||
//
|
||||
// In other forks, |bn_mul_mont| returns an |int| indicating whether it
|
||||
// actually did the multiplication. All our implementations always do the
|
||||
// multiplication, and forcing callers to deal with the possibility of it
|
||||
// failing just leads to further problems.
|
||||
//
|
||||
// In other forks, |bn_mod_mul|'s `num` argument has type |int| but it is
|
||||
// implicitly treated as a |size_t|; when |int| is smaller than |size_t|
|
||||
// then the |movq 48(%rsp),%r9| done by x86_64-xlate.pl implicitly does the
|
||||
// conversion.
|
||||
OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) ||
|
||||
(sizeof(int) == 4 && sizeof(size_t) == 8),
|
||||
"int and size_t ABI mismatch");
|
||||
void GFp_bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
|
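// As a reference sketch of what |GFp_bn_mul_mont| computes (not the
// vendored assembly, and not constant-time): word-serial Montgomery
// multiplication r = a*b*R^-1 mod n with R = 2^(32*num). Limb width is
// fixed at 32 bits here purely so every carry fits in a plain uint64_t;
// |n0| is -n^-1 mod 2^32, and |num| is assumed to be at most 64.
#include <stdint.h>
#include <stddef.h>

static void mont_mul_ref32(uint32_t *r, const uint32_t *a, const uint32_t *b,
                           const uint32_t *n, uint32_t n0, size_t num) {
  uint32_t t[64 + 2] = {0};  /* running (num+2)-limb accumulator */
  for (size_t i = 0; i < num; i++) {
    /* t += a * b[i] */
    uint64_t carry = 0;
    for (size_t j = 0; j < num; j++) {
      uint64_t v = (uint64_t)a[j] * b[i] + t[j] + carry;
      t[j] = (uint32_t)v;
      carry = v >> 32;
    }
    uint64_t top = (uint64_t)t[num] + carry;
    t[num] = (uint32_t)top;
    t[num + 1] = (uint32_t)(top >> 32);

    /* t += m * n with m chosen so the low limb vanishes, then drop it. */
    uint32_t m = (uint32_t)(t[0] * (uint64_t)n0);
    carry = ((uint64_t)m * n[0] + t[0]) >> 32;
    for (size_t j = 1; j < num; j++) {
      uint64_t v = (uint64_t)m * n[j] + t[j] + carry;
      t[j - 1] = (uint32_t)v;
      carry = v >> 32;
    }
    top = (uint64_t)t[num] + carry;
    t[num - 1] = (uint32_t)top;
    t[num] = t[num + 1] + (uint32_t)(top >> 32);
    t[num + 1] = 0;
  }
  /* Final conditional subtraction: if t >= n, return t - n, else t. */
  uint32_t d[64];
  uint64_t borrow = 0;
  for (size_t j = 0; j < num; j++) {
    uint64_t v = (uint64_t)t[j] - n[j] - borrow;
    d[j] = (uint32_t)v;
    borrow = (v >> 32) & 1;
  }
  int ge = t[num] >= borrow;  /* no overall borrow means t >= n */
  for (size_t j = 0; j < num; j++) {
    r[j] = ge ? d[j] : t[j];
  }
}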
||||
|
||||
static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out,
|
||||
BN_ULONG a, BN_ULONG b) {
|
||||
#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
|
||||
*low_out = _umul128(a, b, high_out);
|
||||
#else
|
||||
BN_ULLONG result = (BN_ULLONG)a * b;
|
||||
*low_out = (BN_ULONG)result;
|
||||
*high_out = (BN_ULONG)(result >> BN_BITS2);
|
||||
#endif
|
||||
}
|
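// For example (64-bit limbs, arbitrary values):
//
//   BN_ULONG lo, hi;
//   bn_umult_lohi(&lo, &hi, (BN_ULONG)0xfffffffffffffffd, (BN_ULONG)3);
//   // hi == 2, lo == 0xfffffffffffffff7, i.e. the full 128-bit product.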
||||
|
||||
#endif // OPENSSL_HEADER_BN_INTERNAL_H
|
||||
158
zeroidc/vendor/ring/crypto/fipsmodule/bn/montgomery.c
vendored
Normal file
@@ -0,0 +1,158 @@
|
||||
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
|
||||
* All rights reserved.
|
||||
*
|
||||
* This package is an SSL implementation written
|
||||
* by Eric Young (eay@cryptsoft.com).
|
||||
* The implementation was written so as to conform with Netscapes SSL.
|
||||
*
|
||||
* This library is free for commercial and non-commercial use as long as
|
||||
* the following conditions are aheared to. The following conditions
|
||||
* apply to all code found in this distribution, be it the RC4, RSA,
|
||||
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
|
||||
* included with this distribution is covered by the same copyright terms
|
||||
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
|
||||
*
|
||||
* Copyright remains Eric Young's, and as such any Copyright notices in
|
||||
* the code are not to be removed.
|
||||
* If this package is used in a product, Eric Young should be given attribution
|
||||
* as the author of the parts of the library used.
|
||||
* This can be in the form of a textual message at program startup or
|
||||
* in documentation (online or textual) provided with the package.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* "This product includes cryptographic software written by
|
||||
* Eric Young (eay@cryptsoft.com)"
|
||||
* The word 'cryptographic' can be left out if the rouines from the library
|
||||
* being used are not cryptographic related :-).
|
||||
* 4. If you include any Windows specific code (or a derivative thereof) from
|
||||
* the apps directory (application code) you must include an acknowledgement:
|
||||
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* The licence and distribution terms for any publically available version or
|
||||
* derivative of this code cannot be changed. i.e. this code cannot simply be
|
||||
* copied and put under another distribution licence
|
||||
* [including the GNU Public Licence.]
|
||||
*/
|
||||
/* ====================================================================
|
||||
* Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this
|
||||
* software must display the following acknowledgment:
|
||||
* "This product includes software developed by the OpenSSL Project
|
||||
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
|
||||
*
|
||||
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
|
||||
* endorse or promote products derived from this software without
|
||||
* prior written permission. For written permission, please contact
|
||||
* openssl-core@openssl.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "OpenSSL"
|
||||
* nor may "OpenSSL" appear in their names without prior written
|
||||
* permission of the OpenSSL Project.
|
||||
*
|
||||
* 6. Redistributions of any form whatsoever must retain the following
|
||||
* acknowledgment:
|
||||
* "This product includes software developed by the OpenSSL Project
|
||||
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
|
||||
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This product includes cryptographic software written by Eric Young
|
||||
* (eay@cryptsoft.com). This product includes software written by Tim
|
||||
* Hudson (tjh@cryptsoft.com). */
|
||||
|
||||
#include "internal.h"
|
||||
#include "../../internal.h"
|
||||
|
||||
#include "../../limbs/limbs.h"
|
||||
#include "../../limbs/limbs.inl"
|
||||
|
||||
OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
|
||||
"BN_MONT_CTX_N0_LIMBS value is invalid");
|
||||
OPENSSL_STATIC_ASSERT(
|
||||
sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t),
|
||||
"uint64_t is insufficient precision for n0");
|
||||
|
||||
int GFp_bn_from_montgomery_in_place(BN_ULONG r[], size_t num_r, BN_ULONG a[],
|
||||
size_t num_a, const BN_ULONG n[],
|
||||
size_t num_n,
|
||||
const BN_ULONG n0_[BN_MONT_CTX_N0_LIMBS]) {
|
||||
if (num_n == 0 || num_r != num_n || num_a != 2 * num_n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On
|
||||
// input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r|
|
||||
// includes |carry| which is stored separately.
|
||||
BN_ULONG n0 = n0_[0];
|
||||
BN_ULONG carry = 0;
|
||||
for (size_t i = 0; i < num_n; i++) {
|
||||
BN_ULONG v = GFp_limbs_mul_add_limb(a + i, n, a[i] * n0, num_n);
|
||||
v += carry + a[i + num_n];
|
||||
carry |= (v != a[i + num_n]);
|
||||
carry &= (v <= a[i + num_n]);
|
||||
a[i + num_n] = v;
|
||||
}
|
||||
|
||||
// Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a|
|
||||
// includes |carry| which is stored separately.
|
||||
a += num_n;
|
||||
|
||||
// |a| thus requires at most one additional subtraction |n| to be reduced.
|
||||
// Subtract |n| and select the answer in constant time.
|
||||
BN_ULONG v = limbs_sub(r, a, n, num_n) - carry;
|
||||
// |v| is one if |a| - |n| underflowed or zero if it did not. Note |v| cannot
|
||||
// be -1. That would imply the subtraction did not fit in |num_n| words, and
|
||||
// we know at most one subtraction is needed.
|
||||
v = 0u - v;
|
||||
for (size_t i = 0; i < num_n; i++) {
|
||||
r[i] = constant_time_select_w(v, a[i], r[i]);
|
||||
a[i] = 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
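A minimal standalone sketch (not part of the vendored sources) of the reduction idea described in the comments above, specialised to a single 32-bit limb with R = 2^32: add the multiple of |n| that zeroes the low limb, shift right by one limb, then subtract |n| at most once. The helper names and the use of unsigned __int128 (a GCC/Clang extension) are assumptions of this sketch; the vendored code also performs the final selection in constant time rather than with a branch.

/* --- standalone sketch, not part of the vendored sources --- */
#include <assert.h>
#include <stdint.h>

/* Inverse of an odd n modulo 2^32 by Newton iteration (illustrative helper). */
static uint32_t inv_mod_2_32(uint32_t n) {
  uint32_t x = n;                            /* correct to 3 bits for odd n */
  for (int i = 0; i < 4; i++) {
    x *= 2u - n * x;                         /* each step doubles the correct bits */
  }
  return x;                                  /* n * x == 1 (mod 2^32) */
}

/* Given a < n * 2^32, return a * R^-1 mod n, with R = 2^32 and n0 = -1/n mod R. */
static uint32_t mont_reduce_1_limb(uint64_t a, uint32_t n, uint32_t n0) {
  uint32_t m = (uint32_t)a * n0;             /* multiple of n that zeroes the low limb */
  unsigned __int128 t = (unsigned __int128)a + (unsigned __int128)m * n;
  uint64_t r = (uint64_t)(t >> 32);          /* shift one limb to divide by R; r < 2n */
  return (uint32_t)(r >= n ? r - n : r);     /* at most one subtraction of n is needed */
}

int main(void) {
  uint32_t n = 0xfffffffbu;                  /* an odd modulus */
  uint32_t n0 = 0u - inv_mod_2_32(n);        /* -1/n mod 2^32 */
  uint64_t a = (uint64_t)0x12345678u * 0x9abcdef1u;  /* some input < n * 2^32 */
  uint32_t r = mont_reduce_1_limb(a, n, n0);
  assert(((uint64_t)r << 32) % n == a % n);  /* r * R == a (mod n) */
  return 0;
}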
105
zeroidc/vendor/ring/crypto/fipsmodule/bn/montgomery_inv.c
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
/* Copyright 2016 Brian Smith.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include "internal.h"
#include "../../internal.h"


OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
                      "BN_MONT_CTX_N0_LIMBS value is invalid");
OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t),
                      "uint64_t is insufficient precision for n0");

// LG_LITTLE_R is log_2(r).
#define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2)

// GFp_bn_neg_inv_mod_r_u64 calculates -1/n mod r; i.e. it calculates |v|
// such that u*r - v*n == 1. |r| is the constant defined in |bn_mont_n0|. |n|
// must be odd.
//
// This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery
// Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf).
// It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and
// Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000"
// (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21).
//
// This is inspired by Joppe W. Bos's "Constant Time Modular Inversion"
// (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is
// constant-time with respect to |n|. We assume uint64_t additions,
// subtractions, shifts, and bitwise operations are all constant time, which
// may be a large leap of faith on 32-bit targets. We avoid division and
// multiplication, which tend to be the most problematic in terms of timing
// leaks.
//
// Most GCD implementations return values such that |u*r + v*n == 1|, so the
// caller would have to negate the resultant |v| for the purpose of Montgomery
// multiplication. This implementation does the negation implicitly by doing
// the computations as a difference instead of a sum.
uint64_t GFp_bn_neg_inv_mod_r_u64(uint64_t n) {
  dev_assert_secret(n % 2 == 1);

  // alpha == 2**(lg r - 1) == r / 2.
  static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1);

  const uint64_t beta = n;

  uint64_t u = 1;
  uint64_t v = 0;

  // The invariant maintained from here on is:
  // 2**(lg r - i) == u*2*alpha - v*beta.
  for (size_t i = 0; i < LG_LITTLE_R; ++i) {
#if BN_BITS2 == 64 && defined(BN_ULLONG)
    dev_assert_secret((BN_ULLONG)(1) << (LG_LITTLE_R - i) ==
                      ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif

    // Delete a common factor of 2 in u and v if |u| is even. Otherwise, set
    // |u = (u + beta) / 2| and |v = (v / 2) + alpha|.

    uint64_t u_is_odd = UINT64_C(0) - (u & 1);  // Either 0xff..ff or 0.

    // The addition can overflow, so use Dietz's method for it.
    //
    // Dietz calculates (x+y)/2 by (x xor y)>>1 + x&y. This is valid for all
    // (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values
    // (embedded in 64 bits so that overflow can be ignored):
    //
    // (declare-fun x () (_ BitVec 64))
    // (declare-fun y () (_ BitVec 64))
    // (assert (let (
    //    (one (_ bv1 64))
    //    (thirtyTwo (_ bv32 64)))
    //    (and
    //      (bvult x (bvshl one thirtyTwo))
    //      (bvult y (bvshl one thirtyTwo))
    //      (not (=
    //        (bvadd (bvlshr (bvxor x y) one) (bvand x y))
    //        (bvlshr (bvadd x y) one)))
    // )))
    // (check-sat)
    uint64_t beta_if_u_is_odd = beta & u_is_odd;  // Either |beta| or 0.
    u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd);

    uint64_t alpha_if_u_is_odd = alpha & u_is_odd;  /* Either |alpha| or 0. */
    v = (v >> 1) + alpha_if_u_is_odd;
  }

  // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha.
#if BN_BITS2 == 64 && defined(BN_ULLONG)
  dev_assert_secret(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif

  return v;
}
|
||||
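A standalone check (not part of the vendored sources) of two facts relied on above: Dietz's overflow-proof average ((x ^ y) >> 1) + (x & y), and that the loop yields |v| with n * (0 - v) == 1 (mod 2^64), i.e. the negated result is exactly the n0 value needed for Montgomery reduction. Function names here are illustrative.

/* --- standalone sketch, not part of the vendored sources --- */
#include <assert.h>
#include <stdint.h>

/* Dietz's overflow-proof average of two uint64_t values. */
static uint64_t average(uint64_t x, uint64_t y) {
  return ((x ^ y) >> 1) + (x & y);
}

/* The same loop as GFp_bn_neg_inv_mod_r_u64 above, restated with the helper. */
static uint64_t neg_inv_mod_r(uint64_t n) {    /* n must be odd */
  const uint64_t alpha = UINT64_C(1) << 63;    /* r / 2 with r = 2^64 */
  uint64_t u = 1, v = 0;
  for (int i = 0; i < 64; i++) {
    uint64_t u_is_odd = UINT64_C(0) - (u & 1); /* all ones or all zeros */
    uint64_t beta_if_u_is_odd = n & u_is_odd;
    u = average(u, beta_if_u_is_odd);          /* (u + n) / 2 or u / 2, no overflow */
    v = (v >> 1) + (alpha & u_is_odd);
  }
  return v;
}

int main(void) {
  /* The average never overflows, even when x + y would. */
  assert(average(UINT64_MAX, UINT64_MAX - 2) == UINT64_MAX - 1);

  /* u*r - v*n == 1 implies n * (0 - v) == 1 (mod 2^64). */
  for (uint64_t n = 1; n < 1000; n += 2) {
    uint64_t v = neg_inv_mod_r(n);
    assert(n * (UINT64_C(0) - v) == 1);
  }
  return 0;
}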
901
zeroidc/vendor/ring/crypto/fipsmodule/ec/asm/ecp_nistz256-armv4.pl
vendored
Normal file
@@ -0,0 +1,901 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# ECP_NISTZ256 module for ARMv4.
|
||||
#
|
||||
# October 2014.
|
||||
#
|
||||
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
|
||||
# http://eprint.iacr.org/2013/816. In the process of adaptation
|
||||
# original .c module was made 32-bit savvy in order to make this
|
||||
# implementation possible.
|
||||
#
|
||||
# with/without -DECP_NISTZ256_ASM
|
||||
# Cortex-A8 +53-170%
|
||||
# Cortex-A9 +76-205%
|
||||
# Cortex-A15 +100-316%
|
||||
# Snapdragon S4 +66-187%
|
||||
#
|
||||
# Ranges denote minimum and maximum improvement coefficients depending
|
||||
# on benchmark. Lower coefficients are for ECDSA sign, server-side
|
||||
# operation. Keep in mind that +200% means 3x improvement.
|
||||
|
||||
$flavour = shift;
|
||||
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
#include <GFp/arm_arch.h>
|
||||
|
||||
.text
|
||||
#if defined(__thumb2__)
|
||||
.syntax unified
|
||||
.thumb
|
||||
#else
|
||||
.code 32
|
||||
#endif
|
||||
|
||||
.asciz "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 6
|
||||
___
|
||||
|
||||
########################################################################
|
||||
# common register layout, note that $t2 is link register, so that if
|
||||
# internal subroutine uses $t2, then it has to offload lr...
|
||||
|
||||
($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
|
||||
map("r$_",(0..12,14));
|
||||
($t0,$t3)=($ff,$a_ptr);
|
||||
|
||||
$code.=<<___;
|
||||
.type __ecp_nistz256_mul_by_2,%function
|
||||
.align 4
|
||||
__ecp_nistz256_mul_by_2:
|
||||
ldr $a0,[$a_ptr,#0]
|
||||
ldr $a1,[$a_ptr,#4]
|
||||
ldr $a2,[$a_ptr,#8]
|
||||
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself
|
||||
ldr $a3,[$a_ptr,#12]
|
||||
adcs $a1,$a1,$a1
|
||||
ldr $a4,[$a_ptr,#16]
|
||||
adcs $a2,$a2,$a2
|
||||
ldr $a5,[$a_ptr,#20]
|
||||
adcs $a3,$a3,$a3
|
||||
ldr $a6,[$a_ptr,#24]
|
||||
adcs $a4,$a4,$a4
|
||||
ldr $a7,[$a_ptr,#28]
|
||||
adcs $a5,$a5,$a5
|
||||
adcs $a6,$a6,$a6
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$a7
|
||||
adc $ff,$ff,#0
|
||||
|
||||
b .Lreduce_by_sub
|
||||
.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
|
||||
|
||||
@ void GFp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
|
||||
@ const BN_ULONG r2[8]);
|
||||
.globl GFp_nistz256_add
|
||||
.type GFp_nistz256_add,%function
|
||||
.align 4
|
||||
GFp_nistz256_add:
|
||||
stmdb sp!,{r4-r12,lr}
|
||||
bl __ecp_nistz256_add
|
||||
#if __ARM_ARCH__>=5 || !defined(__thumb__)
|
||||
ldmia sp!,{r4-r12,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r12,lr}
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size GFp_nistz256_add,.-GFp_nistz256_add
|
||||
|
||||
.type __ecp_nistz256_add,%function
|
||||
.align 4
|
||||
__ecp_nistz256_add:
|
||||
str lr,[sp,#-4]! @ push lr
|
||||
|
||||
ldr $a0,[$a_ptr,#0]
|
||||
ldr $a1,[$a_ptr,#4]
|
||||
ldr $a2,[$a_ptr,#8]
|
||||
ldr $a3,[$a_ptr,#12]
|
||||
ldr $a4,[$a_ptr,#16]
|
||||
ldr $t0,[$b_ptr,#0]
|
||||
ldr $a5,[$a_ptr,#20]
|
||||
ldr $t1,[$b_ptr,#4]
|
||||
ldr $a6,[$a_ptr,#24]
|
||||
ldr $t2,[$b_ptr,#8]
|
||||
ldr $a7,[$a_ptr,#28]
|
||||
ldr $t3,[$b_ptr,#12]
|
||||
adds $a0,$a0,$t0
|
||||
ldr $t0,[$b_ptr,#16]
|
||||
adcs $a1,$a1,$t1
|
||||
ldr $t1,[$b_ptr,#20]
|
||||
adcs $a2,$a2,$t2
|
||||
ldr $t2,[$b_ptr,#24]
|
||||
adcs $a3,$a3,$t3
|
||||
ldr $t3,[$b_ptr,#28]
|
||||
adcs $a4,$a4,$t0
|
||||
adcs $a5,$a5,$t1
|
||||
adcs $a6,$a6,$t2
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$t3
|
||||
adc $ff,$ff,#0
|
||||
ldr lr,[sp],#4 @ pop lr
|
||||
|
||||
.Lreduce_by_sub:
|
||||
|
||||
@ if a+b >= modulus, subtract modulus.
|
||||
@
|
||||
@ But since comparison implies subtraction, we subtract
|
||||
@ modulus and then add it back if subtraction borrowed.
|
||||
|
||||
subs $a0,$a0,#-1
|
||||
sbcs $a1,$a1,#-1
|
||||
sbcs $a2,$a2,#-1
|
||||
sbcs $a3,$a3,#0
|
||||
sbcs $a4,$a4,#0
|
||||
sbcs $a5,$a5,#0
|
||||
sbcs $a6,$a6,#1
|
||||
sbcs $a7,$a7,#-1
|
||||
sbc $ff,$ff,#0
|
||||
|
||||
@ Note that because mod has special form, i.e. consists of
|
||||
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
|
||||
@ using value of borrow as a whole or extracting single bit.
|
||||
@ Follow $ff register...
|
||||
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
str $a0,[$r_ptr,#0]
|
||||
adcs $a2,$a2,$ff
|
||||
str $a1,[$r_ptr,#4]
|
||||
adcs $a3,$a3,#0
|
||||
str $a2,[$r_ptr,#8]
|
||||
adcs $a4,$a4,#0
|
||||
str $a3,[$r_ptr,#12]
|
||||
adcs $a5,$a5,#0
|
||||
str $a4,[$r_ptr,#16]
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
str $a5,[$r_ptr,#20]
|
||||
adcs $a7,$a7,$ff
|
||||
str $a6,[$r_ptr,#24]
|
||||
str $a7,[$r_ptr,#28]
|
||||
|
||||
mov pc,lr
|
||||
.size __ecp_nistz256_add,.-__ecp_nistz256_add
|
||||
|
||||
.type __ecp_nistz256_mul_by_3,%function
|
||||
.align 4
|
||||
__ecp_nistz256_mul_by_3:
|
||||
str lr,[sp,#-4]! @ push lr
|
||||
|
||||
@ As multiplication by 3 is performed as 2*n+n, below are inline
|
||||
@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
|
||||
@ corresponding subroutines for details.
|
||||
|
||||
ldr $a0,[$a_ptr,#0]
|
||||
ldr $a1,[$a_ptr,#4]
|
||||
ldr $a2,[$a_ptr,#8]
|
||||
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
|
||||
ldr $a3,[$a_ptr,#12]
|
||||
adcs $a1,$a1,$a1
|
||||
ldr $a4,[$a_ptr,#16]
|
||||
adcs $a2,$a2,$a2
|
||||
ldr $a5,[$a_ptr,#20]
|
||||
adcs $a3,$a3,$a3
|
||||
ldr $a6,[$a_ptr,#24]
|
||||
adcs $a4,$a4,$a4
|
||||
ldr $a7,[$a_ptr,#28]
|
||||
adcs $a5,$a5,$a5
|
||||
adcs $a6,$a6,$a6
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$a7
|
||||
adc $ff,$ff,#0
|
||||
|
||||
subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores
|
||||
sbcs $a1,$a1,#-1
|
||||
sbcs $a2,$a2,#-1
|
||||
sbcs $a3,$a3,#0
|
||||
sbcs $a4,$a4,#0
|
||||
sbcs $a5,$a5,#0
|
||||
sbcs $a6,$a6,#1
|
||||
sbcs $a7,$a7,#-1
|
||||
sbc $ff,$ff,#0
|
||||
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
adcs $a2,$a2,$ff
|
||||
adcs $a3,$a3,#0
|
||||
adcs $a4,$a4,#0
|
||||
ldr $b_ptr,[$a_ptr,#0]
|
||||
adcs $a5,$a5,#0
|
||||
ldr $t1,[$a_ptr,#4]
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
ldr $t2,[$a_ptr,#8]
|
||||
adc $a7,$a7,$ff
|
||||
|
||||
ldr $t0,[$a_ptr,#12]
|
||||
adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
|
||||
ldr $b_ptr,[$a_ptr,#16]
|
||||
adcs $a1,$a1,$t1
|
||||
ldr $t1,[$a_ptr,#20]
|
||||
adcs $a2,$a2,$t2
|
||||
ldr $t2,[$a_ptr,#24]
|
||||
adcs $a3,$a3,$t0
|
||||
ldr $t3,[$a_ptr,#28]
|
||||
adcs $a4,$a4,$b_ptr
|
||||
adcs $a5,$a5,$t1
|
||||
adcs $a6,$a6,$t2
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$t3
|
||||
adc $ff,$ff,#0
|
||||
ldr lr,[sp],#4 @ pop lr
|
||||
|
||||
b .Lreduce_by_sub
|
||||
.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
|
||||
|
||||
.type __ecp_nistz256_div_by_2,%function
|
||||
.align 4
|
||||
__ecp_nistz256_div_by_2:
|
||||
@ ret = (a is odd ? a+mod : a) >> 1
|
||||
|
||||
ldr $a0,[$a_ptr,#0]
|
||||
ldr $a1,[$a_ptr,#4]
|
||||
ldr $a2,[$a_ptr,#8]
|
||||
mov $ff,$a0,lsl#31 @ place least significant bit to most
|
||||
@ significant position, now arithmetic
|
||||
@ right shift by 31 will produce -1 or
|
||||
@ 0, while logical right shift 1 or 0,
|
||||
@ this is how modulus is conditionally
|
||||
@ synthesized in this case...
|
||||
ldr $a3,[$a_ptr,#12]
|
||||
adds $a0,$a0,$ff,asr#31
|
||||
ldr $a4,[$a_ptr,#16]
|
||||
adcs $a1,$a1,$ff,asr#31
|
||||
ldr $a5,[$a_ptr,#20]
|
||||
adcs $a2,$a2,$ff,asr#31
|
||||
ldr $a6,[$a_ptr,#24]
|
||||
adcs $a3,$a3,#0
|
||||
ldr $a7,[$a_ptr,#28]
|
||||
adcs $a4,$a4,#0
|
||||
mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early
|
||||
@ because it doesn't affect flags
|
||||
adcs $a5,$a5,#0
|
||||
orr $a0,$a0,$a1,lsl#31
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
mov $b_ptr,#0
|
||||
adcs $a7,$a7,$ff,asr#31
|
||||
mov $a1,$a1,lsr#1
|
||||
adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition
|
||||
|
||||
orr $a1,$a1,$a2,lsl#31
|
||||
mov $a2,$a2,lsr#1
|
||||
str $a0,[$r_ptr,#0]
|
||||
orr $a2,$a2,$a3,lsl#31
|
||||
mov $a3,$a3,lsr#1
|
||||
str $a1,[$r_ptr,#4]
|
||||
orr $a3,$a3,$a4,lsl#31
|
||||
mov $a4,$a4,lsr#1
|
||||
str $a2,[$r_ptr,#8]
|
||||
orr $a4,$a4,$a5,lsl#31
|
||||
mov $a5,$a5,lsr#1
|
||||
str $a3,[$r_ptr,#12]
|
||||
orr $a5,$a5,$a6,lsl#31
|
||||
mov $a6,$a6,lsr#1
|
||||
str $a4,[$r_ptr,#16]
|
||||
orr $a6,$a6,$a7,lsl#31
|
||||
mov $a7,$a7,lsr#1
|
||||
str $a5,[$r_ptr,#20]
|
||||
orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit
|
||||
str $a6,[$r_ptr,#24]
|
||||
str $a7,[$r_ptr,#28]
|
||||
|
||||
mov pc,lr
|
||||
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
|
||||
|
||||
.type __ecp_nistz256_sub,%function
|
||||
.align 4
|
||||
__ecp_nistz256_sub:
|
||||
str lr,[sp,#-4]! @ push lr
|
||||
|
||||
ldr $a0,[$a_ptr,#0]
|
||||
ldr $a1,[$a_ptr,#4]
|
||||
ldr $a2,[$a_ptr,#8]
|
||||
ldr $a3,[$a_ptr,#12]
|
||||
ldr $a4,[$a_ptr,#16]
|
||||
ldr $t0,[$b_ptr,#0]
|
||||
ldr $a5,[$a_ptr,#20]
|
||||
ldr $t1,[$b_ptr,#4]
|
||||
ldr $a6,[$a_ptr,#24]
|
||||
ldr $t2,[$b_ptr,#8]
|
||||
ldr $a7,[$a_ptr,#28]
|
||||
ldr $t3,[$b_ptr,#12]
|
||||
subs $a0,$a0,$t0
|
||||
ldr $t0,[$b_ptr,#16]
|
||||
sbcs $a1,$a1,$t1
|
||||
ldr $t1,[$b_ptr,#20]
|
||||
sbcs $a2,$a2,$t2
|
||||
ldr $t2,[$b_ptr,#24]
|
||||
sbcs $a3,$a3,$t3
|
||||
ldr $t3,[$b_ptr,#28]
|
||||
sbcs $a4,$a4,$t0
|
||||
sbcs $a5,$a5,$t1
|
||||
sbcs $a6,$a6,$t2
|
||||
sbcs $a7,$a7,$t3
|
||||
sbc $ff,$ff,$ff @ broadcast borrow bit
|
||||
ldr lr,[sp],#4 @ pop lr
|
||||
|
||||
.Lreduce_by_add:
|
||||
|
||||
@ if a-b borrows, add modulus.
|
||||
@
|
||||
@ Note that because mod has special form, i.e. consists of
|
||||
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
|
||||
@ broadcasting borrow bit to a register, $ff, and using it as
|
||||
@ a whole or extracting single bit.
|
||||
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
str $a0,[$r_ptr,#0]
|
||||
adcs $a2,$a2,$ff
|
||||
str $a1,[$r_ptr,#4]
|
||||
adcs $a3,$a3,#0
|
||||
str $a2,[$r_ptr,#8]
|
||||
adcs $a4,$a4,#0
|
||||
str $a3,[$r_ptr,#12]
|
||||
adcs $a5,$a5,#0
|
||||
str $a4,[$r_ptr,#16]
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
str $a5,[$r_ptr,#20]
|
||||
adcs $a7,$a7,$ff
|
||||
str $a6,[$r_ptr,#24]
|
||||
str $a7,[$r_ptr,#28]
|
||||
|
||||
mov pc,lr
|
||||
.size __ecp_nistz256_sub,.-__ecp_nistz256_sub
|
||||
|
||||
@ void GFp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
|
||||
.globl GFp_nistz256_neg
|
||||
.type GFp_nistz256_neg,%function
|
||||
.align 4
|
||||
GFp_nistz256_neg:
|
||||
stmdb sp!,{r4-r12,lr}
|
||||
bl __ecp_nistz256_neg
|
||||
#if __ARM_ARCH__>=5 || !defined(__thumb__)
|
||||
ldmia sp!,{r4-r12,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r12,lr}
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size GFp_nistz256_neg,.-GFp_nistz256_neg
|
||||
|
||||
.type __ecp_nistz256_neg,%function
|
||||
.align 4
|
||||
__ecp_nistz256_neg:
|
||||
ldr $a0,[$a_ptr,#0]
|
||||
eor $ff,$ff,$ff
|
||||
ldr $a1,[$a_ptr,#4]
|
||||
ldr $a2,[$a_ptr,#8]
|
||||
subs $a0,$ff,$a0
|
||||
ldr $a3,[$a_ptr,#12]
|
||||
sbcs $a1,$ff,$a1
|
||||
ldr $a4,[$a_ptr,#16]
|
||||
sbcs $a2,$ff,$a2
|
||||
ldr $a5,[$a_ptr,#20]
|
||||
sbcs $a3,$ff,$a3
|
||||
ldr $a6,[$a_ptr,#24]
|
||||
sbcs $a4,$ff,$a4
|
||||
ldr $a7,[$a_ptr,#28]
|
||||
sbcs $a5,$ff,$a5
|
||||
sbcs $a6,$ff,$a6
|
||||
sbcs $a7,$ff,$a7
|
||||
sbc $ff,$ff,$ff
|
||||
|
||||
b .Lreduce_by_add
|
||||
.size __ecp_nistz256_neg,.-__ecp_nistz256_neg
|
||||
___
|
||||
{
|
||||
my @acc=map("r$_",(3..11));
|
||||
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
|
||||
|
||||
$code.=<<___;
|
||||
@ void GFp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
|
||||
@ const BN_ULONG r2[8]);
|
||||
.globl GFp_nistz256_mul_mont
|
||||
.type GFp_nistz256_mul_mont,%function
|
||||
.align 4
|
||||
GFp_nistz256_mul_mont:
|
||||
stmdb sp!,{r4-r12,lr}
|
||||
bl __ecp_nistz256_mul_mont
|
||||
#if __ARM_ARCH__>=5 || !defined(__thumb__)
|
||||
ldmia sp!,{r4-r12,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r12,lr}
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont
|
||||
|
||||
.type __ecp_nistz256_mul_mont,%function
|
||||
.align 4
|
||||
__ecp_nistz256_mul_mont:
|
||||
stmdb sp!,{r0-r2,lr} @ make a copy of arguments too
|
||||
|
||||
ldr $bj,[$b_ptr,#0] @ b[0]
|
||||
ldmia $a_ptr,{@acc[1]-@acc[8]}
|
||||
|
||||
umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0]
|
||||
stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so
|
||||
@ that it can be addressed
|
||||
@ without spending register
|
||||
@ on address
|
||||
umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0]
|
||||
umull @acc[2],$t1,@acc[3],$bj
|
||||
adds @acc[1],@acc[1],$t3 @ accumulate high part of mult
|
||||
umull @acc[3],$t2,@acc[4],$bj
|
||||
adcs @acc[2],@acc[2],$t0
|
||||
umull @acc[4],$t3,@acc[5],$bj
|
||||
adcs @acc[3],@acc[3],$t1
|
||||
umull @acc[5],$t0,@acc[6],$bj
|
||||
adcs @acc[4],@acc[4],$t2
|
||||
umull @acc[6],$t1,@acc[7],$bj
|
||||
adcs @acc[5],@acc[5],$t3
|
||||
umull @acc[7],$t2,@acc[8],$bj
|
||||
adcs @acc[6],@acc[6],$t0
|
||||
adcs @acc[7],@acc[7],$t1
|
||||
eor $t3,$t3,$t3 @ first overflow bit is zero
|
||||
adc @acc[8],$t2,#0
|
||||
___
|
||||
for(my $i=1;$i<8;$i++) {
|
||||
my $t4=@acc[0];
|
||||
|
||||
# Reduction iteration is normally performed by accumulating
|
||||
# result of multiplication of modulus by "magic" digit [and
|
||||
# omitting least significant word, which is guaranteed to
|
||||
# be 0], but thanks to special form of modulus and "magic"
|
||||
# digit being equal to least significant word, it can be
|
||||
# performed with additions and subtractions alone. Indeed:
|
||||
#
|
||||
# ffff.0001.0000.0000.0000.ffff.ffff.ffff
|
||||
# * abcd
|
||||
# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
|
||||
#
|
||||
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
|
||||
# rewrite above as:
|
||||
#
|
||||
# xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
|
||||
# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
|
||||
# - abcd.0000.0000.0000.0000.0000.0000.abcd
|
||||
#
|
||||
# or marking redundant operations:
|
||||
#
|
||||
# xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
|
||||
# + abcd.0000.abcd.0000.0000.abcd.----.----.----
|
||||
# - abcd.----.----.----.----.----.----.----
|
||||
|
||||
$code.=<<___;
|
||||
@ multiplication-less reduction $i
|
||||
adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0]
|
||||
ldr $bj,[sp,#40] @ restore b_ptr
|
||||
adcs @acc[4],@acc[4],#0 @ r[4]+=0
|
||||
adcs @acc[5],@acc[5],#0 @ r[5]+=0
|
||||
adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0]
|
||||
ldr $t1,[sp,#0] @ load a[0]
|
||||
adcs @acc[7],@acc[7],#0 @ r[7]+=0
|
||||
ldr $bj,[$bj,#4*$i] @ load b[i]
|
||||
adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0]
|
||||
eor $t0,$t0,$t0
|
||||
adc $t3,$t3,#0 @ overflow bit
|
||||
subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0]
|
||||
ldr $t2,[sp,#4] @ a[1]
|
||||
sbcs @acc[8],@acc[8],#0 @ r[8]-=0
|
||||
umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i]
|
||||
eor $t1,$t1,$t1
|
||||
sbc @acc[0],$t3,#0 @ overflow bit, keep in mind
|
||||
@ that the net result is
|
||||
@ addition of a value which
|
||||
@ makes underflow impossible
|
||||
|
||||
ldr $t3,[sp,#8] @ a[2]
|
||||
umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i]
|
||||
str @acc[0],[sp,#36] @ temporarily offload overflow
|
||||
eor $t2,$t2,$t2
|
||||
ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0]
|
||||
umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i]
|
||||
eor $t3,$t3,$t3
|
||||
adds @acc[2],@acc[2],$t0 @ accumulate high part of mult
|
||||
ldr $t0,[sp,#16] @ a[4]
|
||||
umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i]
|
||||
eor $t4,$t4,$t4
|
||||
adcs @acc[3],@acc[3],$t1
|
||||
ldr $t1,[sp,#20] @ a[5]
|
||||
umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i]
|
||||
eor $t0,$t0,$t0
|
||||
adcs @acc[4],@acc[4],$t2
|
||||
ldr $t2,[sp,#24] @ a[6]
|
||||
umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i]
|
||||
eor $t1,$t1,$t1
|
||||
adcs @acc[5],@acc[5],$t3
|
||||
ldr $t3,[sp,#28] @ a[7]
|
||||
umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i]
|
||||
eor $t2,$t2,$t2
|
||||
adcs @acc[6],@acc[6],$t4
|
||||
ldr @acc[0],[sp,#36] @ restore overflow bit
|
||||
umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i]
|
||||
eor $t3,$t3,$t3
|
||||
adcs @acc[7],@acc[7],$t0
|
||||
adcs @acc[8],@acc[8],$t1
|
||||
adcs @acc[0],$acc[0],$t2
|
||||
adc $t3,$t3,#0 @ new overflow bit
|
||||
___
|
||||
push(@acc,shift(@acc)); # rotate registers, so that
|
||||
# "r[i]" becomes r[i]
|
||||
}
|
||||
$code.=<<___;
|
||||
@ last multiplication-less reduction
|
||||
adds @acc[3],@acc[3],@acc[0]
|
||||
ldr $r_ptr,[sp,#32] @ restore r_ptr
|
||||
adcs @acc[4],@acc[4],#0
|
||||
adcs @acc[5],@acc[5],#0
|
||||
adcs @acc[6],@acc[6],@acc[0]
|
||||
adcs @acc[7],@acc[7],#0
|
||||
adcs @acc[8],@acc[8],@acc[0]
|
||||
adc $t3,$t3,#0
|
||||
subs @acc[7],@acc[7],@acc[0]
|
||||
sbcs @acc[8],@acc[8],#0
|
||||
sbc @acc[0],$t3,#0 @ overflow bit
|
||||
|
||||
@ Final step is "if result > mod, subtract mod", but we do it
|
||||
@ "other way around", namely subtract modulus from result
|
||||
@ and if it borrowed, add modulus back.
|
||||
|
||||
adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1
|
||||
adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1
|
||||
adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1
|
||||
sbcs @acc[4],@acc[4],#0
|
||||
sbcs @acc[5],@acc[5],#0
|
||||
sbcs @acc[6],@acc[6],#0
|
||||
sbcs @acc[7],@acc[7],#1
|
||||
adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1
|
||||
ldr lr,[sp,#44] @ restore lr
|
||||
sbc @acc[0],@acc[0],#0 @ broadcast borrow bit
|
||||
add sp,sp,#48
|
||||
|
||||
@ Note that because mod has special form, i.e. consists of
|
||||
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
|
||||
@ broadcasting borrow bit to a register, @acc[0], and using it as
|
||||
@ a whole or extracting single bit.
|
||||
|
||||
adds @acc[1],@acc[1],@acc[0] @ add modulus or zero
|
||||
adcs @acc[2],@acc[2],@acc[0]
|
||||
str @acc[1],[$r_ptr,#0]
|
||||
adcs @acc[3],@acc[3],@acc[0]
|
||||
str @acc[2],[$r_ptr,#4]
|
||||
adcs @acc[4],@acc[4],#0
|
||||
str @acc[3],[$r_ptr,#8]
|
||||
adcs @acc[5],@acc[5],#0
|
||||
str @acc[4],[$r_ptr,#12]
|
||||
adcs @acc[6],@acc[6],#0
|
||||
str @acc[5],[$r_ptr,#16]
|
||||
adcs @acc[7],@acc[7],@acc[0],lsr#31
|
||||
str @acc[6],[$r_ptr,#20]
|
||||
adc @acc[8],@acc[8],@acc[0]
|
||||
str @acc[7],[$r_ptr,#24]
|
||||
str @acc[8],[$r_ptr,#28]
|
||||
|
||||
mov pc,lr
|
||||
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
|
||||
___
|
||||
}
|
||||
|
||||
{{{
|
||||
########################################################################
|
||||
# Below $aN assignment matches order in which 256-bit result appears in
|
||||
# register bank at return from __ecp_nistz256_mul_mont, so that we can
|
||||
# skip over reloading it from memory. This means that below functions
|
||||
# use custom calling sequence accepting 256-bit input in registers,
|
||||
# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
|
||||
#
|
||||
# See their "normal" counterparts for insights on calculations.
|
||||
|
||||
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
|
||||
$t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
|
||||
my $ff=$b_ptr;
|
||||
|
||||
$code.=<<___;
|
||||
.type __ecp_nistz256_sub_from,%function
|
||||
.align 5
|
||||
__ecp_nistz256_sub_from:
|
||||
str lr,[sp,#-4]! @ push lr
|
||||
|
||||
ldr $t0,[$b_ptr,#0]
|
||||
ldr $t1,[$b_ptr,#4]
|
||||
ldr $t2,[$b_ptr,#8]
|
||||
ldr $t3,[$b_ptr,#12]
|
||||
subs $a0,$a0,$t0
|
||||
ldr $t0,[$b_ptr,#16]
|
||||
sbcs $a1,$a1,$t1
|
||||
ldr $t1,[$b_ptr,#20]
|
||||
sbcs $a2,$a2,$t2
|
||||
ldr $t2,[$b_ptr,#24]
|
||||
sbcs $a3,$a3,$t3
|
||||
ldr $t3,[$b_ptr,#28]
|
||||
sbcs $a4,$a4,$t0
|
||||
sbcs $a5,$a5,$t1
|
||||
sbcs $a6,$a6,$t2
|
||||
sbcs $a7,$a7,$t3
|
||||
sbc $ff,$ff,$ff @ broadcast borrow bit
|
||||
ldr lr,[sp],#4 @ pop lr
|
||||
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
str $a0,[$r_ptr,#0]
|
||||
adcs $a2,$a2,$ff
|
||||
str $a1,[$r_ptr,#4]
|
||||
adcs $a3,$a3,#0
|
||||
str $a2,[$r_ptr,#8]
|
||||
adcs $a4,$a4,#0
|
||||
str $a3,[$r_ptr,#12]
|
||||
adcs $a5,$a5,#0
|
||||
str $a4,[$r_ptr,#16]
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
str $a5,[$r_ptr,#20]
|
||||
adcs $a7,$a7,$ff
|
||||
str $a6,[$r_ptr,#24]
|
||||
str $a7,[$r_ptr,#28]
|
||||
|
||||
mov pc,lr
|
||||
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
|
||||
|
||||
.type __ecp_nistz256_sub_morf,%function
|
||||
.align 5
|
||||
__ecp_nistz256_sub_morf:
|
||||
str lr,[sp,#-4]! @ push lr
|
||||
|
||||
ldr $t0,[$b_ptr,#0]
|
||||
ldr $t1,[$b_ptr,#4]
|
||||
ldr $t2,[$b_ptr,#8]
|
||||
ldr $t3,[$b_ptr,#12]
|
||||
subs $a0,$t0,$a0
|
||||
ldr $t0,[$b_ptr,#16]
|
||||
sbcs $a1,$t1,$a1
|
||||
ldr $t1,[$b_ptr,#20]
|
||||
sbcs $a2,$t2,$a2
|
||||
ldr $t2,[$b_ptr,#24]
|
||||
sbcs $a3,$t3,$a3
|
||||
ldr $t3,[$b_ptr,#28]
|
||||
sbcs $a4,$t0,$a4
|
||||
sbcs $a5,$t1,$a5
|
||||
sbcs $a6,$t2,$a6
|
||||
sbcs $a7,$t3,$a7
|
||||
sbc $ff,$ff,$ff @ broadcast borrow bit
|
||||
ldr lr,[sp],#4 @ pop lr
|
||||
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
str $a0,[$r_ptr,#0]
|
||||
adcs $a2,$a2,$ff
|
||||
str $a1,[$r_ptr,#4]
|
||||
adcs $a3,$a3,#0
|
||||
str $a2,[$r_ptr,#8]
|
||||
adcs $a4,$a4,#0
|
||||
str $a3,[$r_ptr,#12]
|
||||
adcs $a5,$a5,#0
|
||||
str $a4,[$r_ptr,#16]
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
str $a5,[$r_ptr,#20]
|
||||
adcs $a7,$a7,$ff
|
||||
str $a6,[$r_ptr,#24]
|
||||
str $a7,[$r_ptr,#28]
|
||||
|
||||
mov pc,lr
|
||||
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
|
||||
|
||||
.type __ecp_nistz256_add_self,%function
|
||||
.align 4
|
||||
__ecp_nistz256_add_self:
|
||||
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
|
||||
adcs $a1,$a1,$a1
|
||||
adcs $a2,$a2,$a2
|
||||
adcs $a3,$a3,$a3
|
||||
adcs $a4,$a4,$a4
|
||||
adcs $a5,$a5,$a5
|
||||
adcs $a6,$a6,$a6
|
||||
mov $ff,#0
|
||||
adcs $a7,$a7,$a7
|
||||
adc $ff,$ff,#0
|
||||
|
||||
@ if a+b >= modulus, subtract modulus.
|
||||
@
|
||||
@ But since comparison implies subtraction, we subtract
|
||||
@ modulus and then add it back if subtraction borrowed.
|
||||
|
||||
subs $a0,$a0,#-1
|
||||
sbcs $a1,$a1,#-1
|
||||
sbcs $a2,$a2,#-1
|
||||
sbcs $a3,$a3,#0
|
||||
sbcs $a4,$a4,#0
|
||||
sbcs $a5,$a5,#0
|
||||
sbcs $a6,$a6,#1
|
||||
sbcs $a7,$a7,#-1
|
||||
sbc $ff,$ff,#0
|
||||
|
||||
@ Note that because mod has special form, i.e. consists of
|
||||
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
|
||||
@ using value of borrow as a whole or extracting single bit.
|
||||
@ Follow $ff register...
|
||||
|
||||
adds $a0,$a0,$ff @ add synthesized modulus
|
||||
adcs $a1,$a1,$ff
|
||||
str $a0,[$r_ptr,#0]
|
||||
adcs $a2,$a2,$ff
|
||||
str $a1,[$r_ptr,#4]
|
||||
adcs $a3,$a3,#0
|
||||
str $a2,[$r_ptr,#8]
|
||||
adcs $a4,$a4,#0
|
||||
str $a3,[$r_ptr,#12]
|
||||
adcs $a5,$a5,#0
|
||||
str $a4,[$r_ptr,#16]
|
||||
adcs $a6,$a6,$ff,lsr#31
|
||||
str $a5,[$r_ptr,#20]
|
||||
adcs $a7,$a7,$ff
|
||||
str $a6,[$r_ptr,#24]
|
||||
str $a7,[$r_ptr,#28]
|
||||
|
||||
mov pc,lr
|
||||
.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self
|
||||
|
||||
___
|
||||
|
||||
########################################################################
|
||||
# following subroutines are "literal" implementation of those found in
|
||||
# ecp_nistz256.c
|
||||
#
|
||||
########################################################################
|
||||
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
|
||||
#
|
||||
{
|
||||
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
|
||||
# above map() describes stack layout with 5 temporary
|
||||
# 256-bit vectors on top. Then note that we push
|
||||
# starting from r0, which means that we have copy of
|
||||
# input arguments just below these temporary vectors.
|
||||
|
||||
$code.=<<___;
|
||||
.globl GFp_nistz256_point_double
|
||||
.type GFp_nistz256_point_double,%function
|
||||
.align 5
|
||||
GFp_nistz256_point_double:
|
||||
stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
|
||||
sub sp,sp,#32*5
|
||||
|
||||
.Lpoint_double_shortcut:
|
||||
add r3,sp,#$in_x
|
||||
ldmia $a_ptr!,{r4-r11} @ copy in_x
|
||||
stmia r3,{r4-r11}
|
||||
|
||||
add $r_ptr,sp,#$S
|
||||
bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
|
||||
|
||||
add $b_ptr,$a_ptr,#32
|
||||
add $a_ptr,$a_ptr,#32
|
||||
add $r_ptr,sp,#$Zsqr
|
||||
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
|
||||
|
||||
add $a_ptr,sp,#$S
|
||||
add $b_ptr,sp,#$S
|
||||
add $r_ptr,sp,#$S
|
||||
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
|
||||
|
||||
ldr $b_ptr,[sp,#32*5+4]
|
||||
add $a_ptr,$b_ptr,#32
|
||||
add $b_ptr,$b_ptr,#64
|
||||
add $r_ptr,sp,#$tmp0
|
||||
bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
|
||||
|
||||
ldr $r_ptr,[sp,#32*5]
|
||||
add $r_ptr,$r_ptr,#64
|
||||
bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
|
||||
|
||||
add $a_ptr,sp,#$in_x
|
||||
add $b_ptr,sp,#$Zsqr
|
||||
add $r_ptr,sp,#$M
|
||||
bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
|
||||
|
||||
add $a_ptr,sp,#$in_x
|
||||
add $b_ptr,sp,#$Zsqr
|
||||
add $r_ptr,sp,#$Zsqr
|
||||
bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
|
||||
|
||||
add $a_ptr,sp,#$S
|
||||
add $b_ptr,sp,#$S
|
||||
add $r_ptr,sp,#$tmp0
|
||||
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
|
||||
|
||||
add $a_ptr,sp,#$Zsqr
|
||||
add $b_ptr,sp,#$M
|
||||
add $r_ptr,sp,#$M
|
||||
bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
|
||||
|
||||
ldr $r_ptr,[sp,#32*5]
|
||||
add $a_ptr,sp,#$tmp0
|
||||
add $r_ptr,$r_ptr,#32
|
||||
bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
|
||||
|
||||
add $a_ptr,sp,#$M
|
||||
add $r_ptr,sp,#$M
|
||||
bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
|
||||
|
||||
add $a_ptr,sp,#$in_x
|
||||
add $b_ptr,sp,#$S
|
||||
add $r_ptr,sp,#$S
|
||||
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
|
||||
|
||||
add $r_ptr,sp,#$tmp0
|
||||
bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
|
||||
|
||||
ldr $r_ptr,[sp,#32*5]
|
||||
add $a_ptr,sp,#$M
|
||||
add $b_ptr,sp,#$M
|
||||
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
|
||||
|
||||
add $b_ptr,sp,#$tmp0
|
||||
bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
|
||||
|
||||
add $b_ptr,sp,#$S
|
||||
add $r_ptr,sp,#$S
|
||||
bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x);
|
||||
|
||||
add $a_ptr,sp,#$M
|
||||
add $b_ptr,sp,#$S
|
||||
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
|
||||
|
||||
ldr $r_ptr,[sp,#32*5]
|
||||
add $b_ptr,$r_ptr,#32
|
||||
add $r_ptr,$r_ptr,#32
|
||||
bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y);
|
||||
|
||||
add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3"
|
||||
#if __ARM_ARCH__>=5 || !defined(__thumb__)
|
||||
ldmia sp!,{r4-r12,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r12,lr}
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size GFp_nistz256_point_double,.-GFp_nistz256_point_double
|
||||
___
|
||||
}
|
||||
|
||||
}}}
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT or die "error closing STDOUT";
|
||||
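The reduction commentary in the file above relies on the identity that, for a 32-bit digit abcd, the P-256 modulus times that digit can be formed with shifts, additions and subtractions only: +abcd at 32-bit word positions 8, 6 and 3, and -abcd at positions 7 and 0. Below is a standalone check of that identity; it is not part of the vendored sources and its names are illustrative.

/* --- standalone sketch, not part of the vendored sources --- */
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  /* P-256 modulus, least significant 32-bit word first. */
  static const uint32_t p[8] = {0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
                                0x00000000, 0x00000000, 0x00000001, 0xffffffff};
  const uint32_t d = 0xabcd1234;     /* an arbitrary "magic" digit */

  /* Reference: schoolbook p * d into 9 words. */
  uint32_t ref[9];
  uint64_t carry = 0;
  for (int i = 0; i < 8; i++) {
    uint64_t t = (uint64_t)p[i] * d + carry;
    ref[i] = (uint32_t)t;
    carry = t >> 32;
  }
  ref[8] = (uint32_t)carry;

  /* Additions and subtractions only, per the commentary:
   *   d * p = d<<(8*32) - d<<(7*32) + d<<(6*32) + d<<(3*32) - d  */
  int64_t col[9] = {0};
  col[8] += d;
  col[7] -= d;
  col[6] += d;
  col[3] += d;
  col[0] -= d;

  uint32_t alt[9];
  int64_t c = 0;
  for (int i = 0; i < 9; i++) {
    int64_t t = col[i] + c;
    alt[i] = (uint32_t)t;                            /* t mod 2^32 */
    c = (t - (int64_t)alt[i]) / ((int64_t)1 << 32);  /* exact signed carry/borrow */
  }
  assert(c == 0);
  assert(memcmp(alt, ref, sizeof(ref)) == 0);
  return 0;
}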
908
zeroidc/vendor/ring/crypto/fipsmodule/ec/asm/ecp_nistz256-armv8.pl
vendored
Normal file
@@ -0,0 +1,908 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
#
|
||||
# 3. All advertising materials mentioning features or use of this
|
||||
# software must display the following acknowledgment:
|
||||
# "This product includes software developed by the OpenSSL Project
|
||||
# for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
|
||||
#
|
||||
# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
|
||||
# endorse or promote products derived from this software without
|
||||
# prior written permission. For written permission, please contact
|
||||
# openssl-core@openssl.org.
|
||||
#
|
||||
# 5. Products derived from this software may not be called "OpenSSL"
|
||||
# nor may "OpenSSL" appear in their names without prior written
|
||||
# permission of the OpenSSL Project.
|
||||
#
|
||||
# 6. Redistributions of any form whatsoever must retain the following
|
||||
# acknowledgment:
|
||||
# "This product includes software developed by the OpenSSL Project
|
||||
# for use in the OpenSSL Toolkit (http://www.openssl.org/)"
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
|
||||
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
|
||||
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
# OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# ====================================================================
|
||||
#
|
||||
# This product includes cryptographic software written by Eric Young
|
||||
# (eay@cryptsoft.com). This product includes software written by Tim
|
||||
# Hudson (tjh@cryptsoft.com).
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# ECP_NISTZ256 module for ARMv8.
|
||||
#
|
||||
# February 2015.
|
||||
#
|
||||
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
|
||||
# http://eprint.iacr.org/2013/816.
|
||||
#
|
||||
# with/without -DECP_NISTZ256_ASM
|
||||
# Apple A7 +120-360%
|
||||
# Cortex-A53 +120-400%
|
||||
# Cortex-A57 +120-350%
|
||||
# X-Gene +200-330%
|
||||
# Denver +140-400%
|
||||
#
|
||||
# Ranges denote minimum and maximum improvement coefficients depending
|
||||
# on benchmark. Lower coefficients are for ECDSA sign, server-side
|
||||
# operation. Keep in mind that +400% means 5x improvement.
|
||||
|
||||
$flavour = shift;
|
||||
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
{
|
||||
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
|
||||
$acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
|
||||
map("x$_",(0..17,19,20));
|
||||
|
||||
my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont
|
||||
|
||||
$code.=<<___;
|
||||
#include <GFp/arm_arch.h>
|
||||
|
||||
.text
|
||||
.align 5
|
||||
.Lpoly:
|
||||
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
|
||||
.Lone_mont:
|
||||
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
|
||||
.Lone:
|
||||
.quad 1,0,0,0
|
||||
.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
|
||||
// void GFp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
|
||||
// const BN_ULONG x2[4]);
|
||||
.globl GFp_nistz256_mul_mont
|
||||
.type GFp_nistz256_mul_mont,%function
|
||||
.align 4
|
||||
GFp_nistz256_mul_mont:
|
||||
stp x29,x30,[sp,#-32]!
|
||||
add x29,sp,#0
|
||||
stp x19,x20,[sp,#16]
|
||||
|
||||
ldr $bi,[$bp] // bp[0]
|
||||
ldp $a0,$a1,[$ap]
|
||||
ldp $a2,$a3,[$ap,#16]
|
||||
ldr $poly1,.Lpoly+8
|
||||
ldr $poly3,.Lpoly+24
|
||||
|
||||
bl __ecp_nistz256_mul_mont
|
||||
|
||||
ldp x19,x20,[sp,#16]
|
||||
ldp x29,x30,[sp],#32
|
||||
ret
|
||||
.size GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont
|
||||
|
||||
// void GFp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
||||
.globl GFp_nistz256_sqr_mont
|
||||
.type GFp_nistz256_sqr_mont,%function
|
||||
.align 4
|
||||
GFp_nistz256_sqr_mont:
|
||||
stp x29,x30,[sp,#-32]!
|
||||
add x29,sp,#0
|
||||
stp x19,x20,[sp,#16]
|
||||
|
||||
ldp $a0,$a1,[$ap]
|
||||
ldp $a2,$a3,[$ap,#16]
|
||||
ldr $poly1,.Lpoly+8
|
||||
ldr $poly3,.Lpoly+24
|
||||
|
||||
bl __ecp_nistz256_sqr_mont
|
||||
|
||||
ldp x19,x20,[sp,#16]
|
||||
ldp x29,x30,[sp],#32
|
||||
ret
|
||||
.size GFp_nistz256_sqr_mont,.-GFp_nistz256_sqr_mont
|
||||
|
||||
// void GFp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
|
||||
// const BN_ULONG x2[4]);
|
||||
.globl GFp_nistz256_add
|
||||
.type GFp_nistz256_add,%function
|
||||
.align 4
|
||||
GFp_nistz256_add:
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
|
||||
ldp $acc0,$acc1,[$ap]
|
||||
ldp $t0,$t1,[$bp]
|
||||
ldp $acc2,$acc3,[$ap,#16]
|
||||
ldp $t2,$t3,[$bp,#16]
|
||||
ldr $poly1,.Lpoly+8
|
||||
ldr $poly3,.Lpoly+24
|
||||
|
||||
bl __ecp_nistz256_add
|
||||
|
||||
ldp x29,x30,[sp],#16
|
||||
ret
|
||||
.size GFp_nistz256_add,.-GFp_nistz256_add
|
||||
|
||||
// void GFp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
||||
.globl GFp_nistz256_neg
|
||||
.type GFp_nistz256_neg,%function
|
||||
.align 4
|
||||
GFp_nistz256_neg:
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
|
||||
mov $bp,$ap
|
||||
mov $acc0,xzr // a = 0
|
||||
mov $acc1,xzr
|
||||
mov $acc2,xzr
|
||||
mov $acc3,xzr
|
||||
ldr $poly1,.Lpoly+8
|
||||
ldr $poly3,.Lpoly+24
|
||||
|
||||
bl __ecp_nistz256_sub_from
|
||||
|
||||
ldp x29,x30,[sp],#16
|
||||
ret
|
||||
.size GFp_nistz256_neg,.-GFp_nistz256_neg
|
||||
|
||||
// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
|
||||
// to $a0-$a3 and b[0] - to $bi
|
||||
.type __ecp_nistz256_mul_mont,%function
|
||||
.align 4
|
||||
__ecp_nistz256_mul_mont:
|
||||
mul $acc0,$a0,$bi // a[0]*b[0]
|
||||
umulh $t0,$a0,$bi
|
||||
|
||||
mul $acc1,$a1,$bi // a[1]*b[0]
|
||||
umulh $t1,$a1,$bi
|
||||
|
||||
mul $acc2,$a2,$bi // a[2]*b[0]
|
||||
umulh $t2,$a2,$bi
|
||||
|
||||
mul $acc3,$a3,$bi // a[3]*b[0]
|
||||
umulh $t3,$a3,$bi
|
||||
ldr $bi,[$bp,#8] // b[1]
|
||||
|
||||
adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
|
||||
lsl $t0,$acc0,#32
|
||||
adcs $acc2,$acc2,$t1
|
||||
lsr $t1,$acc0,#32
|
||||
adcs $acc3,$acc3,$t2
|
||||
adc $acc4,xzr,$t3
|
||||
mov $acc5,xzr
|
||||
___
|
||||
for($i=1;$i<4;$i++) {
|
||||
# Reduction iteration is normally performed by accumulating
|
||||
# result of multiplication of modulus by "magic" digit [and
|
||||
# omitting least significant word, which is guaranteed to
|
||||
# be 0], but thanks to special form of modulus and "magic"
|
||||
# digit being equal to least significant word, it can be
|
||||
# performed with additions and subtractions alone. Indeed:
|
||||
#
|
||||
# ffff0001.00000000.0000ffff.ffffffff
|
||||
# * abcdefgh
|
||||
# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
|
||||
#
|
||||
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
|
||||
# rewrite above as:
|
||||
#
|
||||
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
|
||||
# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
|
||||
# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
|
||||
#
|
||||
# or marking redundant operations:
|
||||
#
|
||||
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
|
||||
# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
|
||||
# - 0000abcd.efgh0000.--------.--------.--------
|
||||
|
||||
$code.=<<___;
|
||||
subs $t2,$acc0,$t0 // "*0xffff0001"
|
||||
sbc $t3,$acc0,$t1
|
||||
adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
|
||||
mul $t0,$a0,$bi // lo(a[0]*b[i])
|
||||
adcs $acc1,$acc2,$t1
|
||||
mul $t1,$a1,$bi // lo(a[1]*b[i])
|
||||
adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
|
||||
mul $t2,$a2,$bi // lo(a[2]*b[i])
|
||||
adcs $acc3,$acc4,$t3
|
||||
mul $t3,$a3,$bi // lo(a[3]*b[i])
|
||||
adc $acc4,$acc5,xzr
|
||||
|
||||
adds $acc0,$acc0,$t0 // accumulate low parts of multiplication
|
||||
umulh $t0,$a0,$bi // hi(a[0]*b[i])
|
||||
adcs $acc1,$acc1,$t1
|
||||
umulh $t1,$a1,$bi // hi(a[1]*b[i])
|
||||
adcs $acc2,$acc2,$t2
|
||||
umulh $t2,$a2,$bi // hi(a[2]*b[i])
|
||||
adcs $acc3,$acc3,$t3
|
||||
umulh $t3,$a3,$bi // hi(a[3]*b[i])
|
||||
adc $acc4,$acc4,xzr
|
||||
___
|
||||
$code.=<<___ if ($i<3);
|
||||
ldr $bi,[$bp,#8*($i+1)] // b[$i+1]
|
||||
___
|
||||
$code.=<<___;
|
||||
adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
|
||||
lsl $t0,$acc0,#32
|
||||
adcs $acc2,$acc2,$t1
|
||||
lsr $t1,$acc0,#32
|
||||
adcs $acc3,$acc3,$t2
|
||||
adcs $acc4,$acc4,$t3
|
||||
adc $acc5,xzr,xzr
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
// last reduction
|
||||
subs $t2,$acc0,$t0 // "*0xffff0001"
|
||||
sbc $t3,$acc0,$t1
|
||||
adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
|
||||
adcs $acc1,$acc2,$t1
|
||||
adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
|
||||
adcs $acc3,$acc4,$t3
|
||||
adc $acc4,$acc5,xzr
|
||||
|
||||
adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
|
||||
sbcs $t1,$acc1,$poly1
|
||||
sbcs $t2,$acc2,xzr
|
||||
sbcs $t3,$acc3,$poly3
|
||||
sbcs xzr,$acc4,xzr // did it borrow?
|
||||
|
||||
csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
|
||||
csel $acc1,$acc1,$t1,lo
|
||||
csel $acc2,$acc2,$t2,lo
|
||||
stp $acc0,$acc1,[$rp]
|
||||
csel $acc3,$acc3,$t3,lo
|
||||
stp $acc2,$acc3,[$rp,#16]
|
||||
|
||||
ret
|
||||
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
|
||||
|
||||
// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
|
||||
// to $a0-$a3
|
||||
.type __ecp_nistz256_sqr_mont,%function
|
||||
.align 4
|
||||
__ecp_nistz256_sqr_mont:
|
||||
// | | | | | |a1*a0| |
|
||||
// | | | | |a2*a0| | |
|
||||
// | |a3*a2|a3*a0| | | |
|
||||
// | | | |a2*a1| | | |
|
||||
// | | |a3*a1| | | | |
|
||||
// *| | | | | | | | 2|
|
||||
// +|a3*a3|a2*a2|a1*a1|a0*a0|
|
||||
// |--+--+--+--+--+--+--+--|
|
||||
// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
|
||||
//
|
||||
// "can't overflow" below mark carrying into high part of
|
||||
// multiplication result, which can't overflow, because it
|
||||
// can never be all ones.
|
||||
|
||||
mul $acc1,$a1,$a0 // a[1]*a[0]
|
||||
umulh $t1,$a1,$a0
|
||||
mul $acc2,$a2,$a0 // a[2]*a[0]
|
||||
umulh $t2,$a2,$a0
|
||||
mul $acc3,$a3,$a0 // a[3]*a[0]
|
||||
umulh $acc4,$a3,$a0
|
||||
|
||||
adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
|
||||
mul $t0,$a2,$a1 // a[2]*a[1]
|
||||
umulh $t1,$a2,$a1
|
||||
adcs $acc3,$acc3,$t2
|
||||
mul $t2,$a3,$a1 // a[3]*a[1]
|
||||
umulh $t3,$a3,$a1
|
||||
adc $acc4,$acc4,xzr // can't overflow
|
||||
|
||||
mul $acc5,$a3,$a2 // a[3]*a[2]
|
||||
umulh $acc6,$a3,$a2
|
||||
|
||||
adds $t1,$t1,$t2 // accumulate high parts of multiplication
|
||||
mul $acc0,$a0,$a0 // a[0]*a[0]
|
||||
adc $t2,$t3,xzr // can't overflow
|
||||
|
||||
adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
|
||||
umulh $a0,$a0,$a0
|
||||
adcs $acc4,$acc4,$t1
|
||||
mul $t1,$a1,$a1 // a[1]*a[1]
|
||||
adcs $acc5,$acc5,$t2
|
||||
umulh $a1,$a1,$a1
|
||||
adc $acc6,$acc6,xzr // can't overflow
|
||||
|
||||
adds $acc1,$acc1,$acc1 // acc[1-6]*=2
|
||||
mul $t2,$a2,$a2 // a[2]*a[2]
|
||||
adcs $acc2,$acc2,$acc2
|
||||
umulh $a2,$a2,$a2
|
||||
adcs $acc3,$acc3,$acc3
|
||||
mul $t3,$a3,$a3 // a[3]*a[3]
|
||||
adcs $acc4,$acc4,$acc4
|
||||
umulh $a3,$a3,$a3
|
||||
adcs $acc5,$acc5,$acc5
|
||||
adcs $acc6,$acc6,$acc6
|
||||
adc $acc7,xzr,xzr
|
||||
|
||||
adds $acc1,$acc1,$a0 // +a[i]*a[i]
|
||||
adcs $acc2,$acc2,$t1
|
||||
adcs $acc3,$acc3,$a1
|
||||
adcs $acc4,$acc4,$t2
|
||||
adcs $acc5,$acc5,$a2
|
||||
lsl $t0,$acc0,#32
|
||||
adcs $acc6,$acc6,$t3
|
||||
lsr $t1,$acc0,#32
|
||||
adc $acc7,$acc7,$a3
|
||||
___
|
||||
for($i=0;$i<3;$i++) { # reductions, see commentary in
|
||||
# multiplication for details
|
||||
$code.=<<___;
|
||||
subs $t2,$acc0,$t0 // "*0xffff0001"
|
||||
sbc $t3,$acc0,$t1
|
||||
adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
|
||||
adcs $acc1,$acc2,$t1
|
||||
lsl $t0,$acc0,#32
|
||||
adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
|
||||
lsr $t1,$acc0,#32
|
||||
adc $acc3,$t3,xzr // can't overflow
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
subs $t2,$acc0,$t0 // "*0xffff0001"
|
||||
sbc $t3,$acc0,$t1
|
||||
adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
|
||||
adcs $acc1,$acc2,$t1
|
||||
adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
|
||||
adc $acc3,$t3,xzr // can't overflow
|
||||
|
||||
adds $acc0,$acc0,$acc4 // accumulate upper half
|
||||
adcs $acc1,$acc1,$acc5
|
||||
adcs $acc2,$acc2,$acc6
|
||||
adcs $acc3,$acc3,$acc7
|
||||
adc $acc4,xzr,xzr
|
||||
|
||||
adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
|
||||
sbcs $t1,$acc1,$poly1
|
||||
sbcs $t2,$acc2,xzr
|
||||
sbcs $t3,$acc3,$poly3
|
||||
sbcs xzr,$acc4,xzr // did it borrow?
|
||||
|
||||
csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
|
||||
csel $acc1,$acc1,$t1,lo
|
||||
csel $acc2,$acc2,$t2,lo
|
||||
stp $acc0,$acc1,[$rp]
|
||||
csel $acc3,$acc3,$t3,lo
|
||||
stp $acc2,$acc3,[$rp,#16]
|
||||
|
||||
ret
|
||||
.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
|
||||
|
||||
// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
|
||||
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
|
||||
// contexts, e.g. in multiplication by 2 and 3...
|
||||
.type __ecp_nistz256_add,%function
|
||||
.align 4
|
||||
__ecp_nistz256_add:
|
||||
adds $acc0,$acc0,$t0 // ret = a+b
|
||||
adcs $acc1,$acc1,$t1
|
||||
adcs $acc2,$acc2,$t2
|
||||
adcs $acc3,$acc3,$t3
|
||||
adc $ap,xzr,xzr // zap $ap
|
||||
|
||||
adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
|
||||
sbcs $t1,$acc1,$poly1
|
||||
sbcs $t2,$acc2,xzr
|
||||
sbcs $t3,$acc3,$poly3
|
||||
sbcs xzr,$ap,xzr // did subtraction borrow?
|
||||
|
||||
csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
|
||||
csel $acc1,$acc1,$t1,lo
|
||||
csel $acc2,$acc2,$t2,lo
|
||||
stp $acc0,$acc1,[$rp]
|
||||
csel $acc3,$acc3,$t3,lo
|
||||
stp $acc2,$acc3,[$rp,#16]
|
||||
|
||||
ret
|
||||
.size __ecp_nistz256_add,.-__ecp_nistz256_add
|
||||
|
||||
.type __ecp_nistz256_sub_from,%function
|
||||
.align 4
|
||||
__ecp_nistz256_sub_from:
|
||||
ldp $t0,$t1,[$bp]
|
||||
ldp $t2,$t3,[$bp,#16]
|
||||
subs $acc0,$acc0,$t0 // ret = a-b
|
||||
sbcs $acc1,$acc1,$t1
|
||||
sbcs $acc2,$acc2,$t2
|
||||
sbcs $acc3,$acc3,$t3
|
||||
sbc $ap,xzr,xzr // zap $ap
|
||||
|
||||
subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
|
||||
adcs $t1,$acc1,$poly1
|
||||
adcs $t2,$acc2,xzr
|
||||
adc $t3,$acc3,$poly3
|
||||
cmp $ap,xzr // did subtraction borrow?
|
||||
|
||||
csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
|
||||
csel $acc1,$acc1,$t1,eq
|
||||
csel $acc2,$acc2,$t2,eq
|
||||
stp $acc0,$acc1,[$rp]
|
||||
csel $acc3,$acc3,$t3,eq
|
||||
stp $acc2,$acc3,[$rp,#16]
|
||||
|
||||
ret
|
||||
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
|
||||
|
||||
.type __ecp_nistz256_sub_morf,%function
|
||||
.align 4
|
||||
__ecp_nistz256_sub_morf:
|
||||
ldp $t0,$t1,[$bp]
|
||||
ldp $t2,$t3,[$bp,#16]
|
||||
subs $acc0,$t0,$acc0 // ret = b-a
|
||||
sbcs $acc1,$t1,$acc1
|
||||
sbcs $acc2,$t2,$acc2
|
||||
sbcs $acc3,$t3,$acc3
|
||||
sbc $ap,xzr,xzr // zap $ap
|
||||
|
||||
subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
|
||||
adcs $t1,$acc1,$poly1
|
||||
adcs $t2,$acc2,xzr
|
||||
adc $t3,$acc3,$poly3
|
||||
cmp $ap,xzr // did subtraction borrow?
|
||||
|
||||
csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
|
||||
csel $acc1,$acc1,$t1,eq
|
||||
csel $acc2,$acc2,$t2,eq
|
||||
stp $acc0,$acc1,[$rp]
|
||||
csel $acc3,$acc3,$t3,eq
|
||||
stp $acc2,$acc3,[$rp,#16]
|
||||
|
||||
ret
|
||||
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
|
||||
|
||||
.type __ecp_nistz256_div_by_2,%function
|
||||
.align 4
|
||||
__ecp_nistz256_div_by_2:
|
||||
subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus
|
||||
adcs $t1,$acc1,$poly1
|
||||
adcs $t2,$acc2,xzr
|
||||
adcs $t3,$acc3,$poly3
|
||||
adc $ap,xzr,xzr // zap $ap
|
||||
tst $acc0,#1 // is a even?
|
||||
|
||||
csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
|
||||
csel $acc1,$acc1,$t1,eq
|
||||
csel $acc2,$acc2,$t2,eq
|
||||
csel $acc3,$acc3,$t3,eq
|
||||
csel $ap,xzr,$ap,eq
|
||||
|
||||
lsr $acc0,$acc0,#1 // ret >>= 1
|
||||
orr $acc0,$acc0,$acc1,lsl#63
|
||||
lsr $acc1,$acc1,#1
|
||||
orr $acc1,$acc1,$acc2,lsl#63
|
||||
lsr $acc2,$acc2,#1
|
||||
orr $acc2,$acc2,$acc3,lsl#63
|
||||
lsr $acc3,$acc3,#1
|
||||
stp $acc0,$acc1,[$rp]
|
||||
orr $acc3,$acc3,$ap,lsl#63
|
||||
stp $acc2,$acc3,[$rp,#16]
|
||||
|
||||
ret
|
||||
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
|
||||
___
|
||||
########################################################################
|
||||
# following subroutines are "literal" implementation of those found in
|
||||
# ecp_nistz256.c
|
||||
#
|
||||
########################################################################
|
||||
# void GFp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
|
||||
#
|
||||
{
|
||||
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
|
||||
# above map() describes stack layout with 4 temporary
|
||||
# 256-bit vectors on top.
|
||||
my ($rp_real,$ap_real) = map("x$_",(21,22));
|
||||
|
||||
$code.=<<___;
|
||||
.globl GFp_nistz256_point_double
|
||||
.type GFp_nistz256_point_double,%function
|
||||
.align 5
|
||||
GFp_nistz256_point_double:
|
||||
stp x29,x30,[sp,#-80]!
|
||||
add x29,sp,#0
|
||||
stp x19,x20,[sp,#16]
|
||||
stp x21,x22,[sp,#32]
|
||||
sub sp,sp,#32*4
|
||||
|
||||
.Ldouble_shortcut:
|
||||
ldp $acc0,$acc1,[$ap,#32]
|
||||
mov $rp_real,$rp
|
||||
ldp $acc2,$acc3,[$ap,#48]
|
||||
mov $ap_real,$ap
|
||||
ldr $poly1,.Lpoly+8
|
||||
mov $t0,$acc0
|
||||
ldr $poly3,.Lpoly+24
|
||||
mov $t1,$acc1
|
||||
ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont
|
||||
mov $t2,$acc2
|
||||
mov $t3,$acc3
|
||||
ldp $a2,$a3,[$ap_real,#64+16]
|
||||
add $rp,sp,#$S
|
||||
bl __ecp_nistz256_add // p256_mul_by_2(S, in_y);
|
||||
|
||||
add $rp,sp,#$Zsqr
|
||||
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
|
||||
|
||||
ldp $t0,$t1,[$ap_real]
|
||||
ldp $t2,$t3,[$ap_real,#16]
|
||||
mov $a0,$acc0 // put Zsqr aside for p256_sub
|
||||
mov $a1,$acc1
|
||||
mov $a2,$acc2
|
||||
mov $a3,$acc3
|
||||
add $rp,sp,#$M
|
||||
bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x);
|
||||
|
||||
add $bp,$ap_real,#0
|
||||
mov $acc0,$a0 // restore Zsqr
|
||||
mov $acc1,$a1
|
||||
ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
|
||||
mov $acc2,$a2
|
||||
mov $acc3,$a3
|
||||
ldp $a2,$a3,[sp,#$S+16]
|
||||
add $rp,sp,#$Zsqr
|
||||
bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
|
||||
|
||||
add $rp,sp,#$S
|
||||
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
|
||||
|
||||
ldr $bi,[$ap_real,#32]
|
||||
ldp $a0,$a1,[$ap_real,#64]
|
||||
ldp $a2,$a3,[$ap_real,#64+16]
|
||||
add $bp,$ap_real,#32
|
||||
add $rp,sp,#$tmp0
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
|
||||
|
||||
mov $t0,$acc0
|
||||
mov $t1,$acc1
|
||||
ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
|
||||
mov $t2,$acc2
|
||||
mov $t3,$acc3
|
||||
ldp $a2,$a3,[sp,#$S+16]
|
||||
add $rp,$rp_real,#64
|
||||
bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0);
|
||||
|
||||
add $rp,sp,#$tmp0
|
||||
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
|
||||
|
||||
ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont
|
||||
ldp $a0,$a1,[sp,#$M]
|
||||
ldp $a2,$a3,[sp,#$M+16]
|
||||
add $rp,$rp_real,#32
|
||||
bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
|
||||
|
||||
add $bp,sp,#$Zsqr
|
||||
add $rp,sp,#$M
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
|
||||
|
||||
mov $t0,$acc0 // duplicate M
|
||||
mov $t1,$acc1
|
||||
mov $t2,$acc2
|
||||
mov $t3,$acc3
|
||||
mov $a0,$acc0 // put M aside
|
||||
mov $a1,$acc1
|
||||
mov $a2,$acc2
|
||||
mov $a3,$acc3
|
||||
add $rp,sp,#$M
|
||||
bl __ecp_nistz256_add
|
||||
mov $t0,$a0 // restore M
|
||||
mov $t1,$a1
|
||||
ldr $bi,[$ap_real] // forward load for p256_mul_mont
|
||||
mov $t2,$a2
|
||||
ldp $a0,$a1,[sp,#$S]
|
||||
mov $t3,$a3
|
||||
ldp $a2,$a3,[sp,#$S+16]
|
||||
bl __ecp_nistz256_add // p256_mul_by_3(M, M);
|
||||
|
||||
add $bp,$ap_real,#0
|
||||
add $rp,sp,#$S
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
|
||||
|
||||
mov $t0,$acc0
|
||||
mov $t1,$acc1
|
||||
ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont
|
||||
mov $t2,$acc2
|
||||
mov $t3,$acc3
|
||||
ldp $a2,$a3,[sp,#$M+16]
|
||||
add $rp,sp,#$tmp0
|
||||
bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S);
|
||||
|
||||
add $rp,$rp_real,#0
|
||||
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
|
||||
|
||||
add $bp,sp,#$tmp0
|
||||
bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
|
||||
|
||||
add $bp,sp,#$S
|
||||
add $rp,sp,#$S
|
||||
bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
|
||||
|
||||
ldr $bi,[sp,#$M]
|
||||
mov $a0,$acc0 // copy S
|
||||
mov $a1,$acc1
|
||||
mov $a2,$acc2
|
||||
mov $a3,$acc3
|
||||
add $bp,sp,#$M
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
|
||||
|
||||
add $bp,$rp_real,#32
|
||||
add $rp,$rp_real,#32
|
||||
bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
|
||||
|
||||
add sp,x29,#0 // destroy frame
|
||||
ldp x19,x20,[x29,#16]
|
||||
ldp x21,x22,[x29,#32]
|
||||
ldp x29,x30,[sp],#80
|
||||
ret
|
||||
.size GFp_nistz256_point_double,.-GFp_nistz256_point_double
|
||||
___
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# void GFp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
|
||||
# const P256_POINT_AFFINE *in2);
|
||||
{
|
||||
my ($res_x,$res_y,$res_z,
|
||||
$U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
|
||||
my $Z1sqr = $S2;
|
||||
# above map() describes stack layout with 10 temporary
|
||||
# 256-bit vectors on top.
|
||||
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
|
||||
|
||||
$code.=<<___;
|
||||
.globl GFp_nistz256_point_add_affine
|
||||
.type GFp_nistz256_point_add_affine,%function
|
||||
.align 5
|
||||
GFp_nistz256_point_add_affine:
|
||||
stp x29,x30,[sp,#-80]!
|
||||
add x29,sp,#0
|
||||
stp x19,x20,[sp,#16]
|
||||
stp x21,x22,[sp,#32]
|
||||
stp x23,x24,[sp,#48]
|
||||
stp x25,x26,[sp,#64]
|
||||
sub sp,sp,#32*10
|
||||
|
||||
mov $rp_real,$rp
|
||||
mov $ap_real,$ap
|
||||
mov $bp_real,$bp
|
||||
ldr $poly1,.Lpoly+8
|
||||
ldr $poly3,.Lpoly+24
|
||||
|
||||
ldp $a0,$a1,[$ap,#64] // in1_z
|
||||
ldp $a2,$a3,[$ap,#64+16]
|
||||
orr $t0,$a0,$a1
|
||||
orr $t2,$a2,$a3
|
||||
orr $in1infty,$t0,$t2
|
||||
cmp $in1infty,#0
|
||||
csetm $in1infty,ne // !in1infty
|
||||
|
||||
ldp $acc0,$acc1,[$bp] // in2_x
|
||||
ldp $acc2,$acc3,[$bp,#16]
|
||||
ldp $t0,$t1,[$bp,#32] // in2_y
|
||||
ldp $t2,$t3,[$bp,#48]
|
||||
orr $acc0,$acc0,$acc1
|
||||
orr $acc2,$acc2,$acc3
|
||||
orr $t0,$t0,$t1
|
||||
orr $t2,$t2,$t3
|
||||
orr $acc0,$acc0,$acc2
|
||||
orr $t0,$t0,$t2
|
||||
orr $in2infty,$acc0,$t0
|
||||
cmp $in2infty,#0
|
||||
csetm $in2infty,ne // !in2infty
|
||||
|
||||
add $rp,sp,#$Z1sqr
|
||||
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
|
||||
|
||||
mov $a0,$acc0
|
||||
mov $a1,$acc1
|
||||
mov $a2,$acc2
|
||||
mov $a3,$acc3
|
||||
ldr $bi,[$bp_real]
|
||||
add $bp,$bp_real,#0
|
||||
add $rp,sp,#$U2
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
|
||||
|
||||
add $bp,$ap_real,#0
|
||||
ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont
|
||||
ldp $a0,$a1,[sp,#$Z1sqr]
|
||||
ldp $a2,$a3,[sp,#$Z1sqr+16]
|
||||
add $rp,sp,#$H
|
||||
bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
|
||||
|
||||
add $bp,$ap_real,#64
|
||||
add $rp,sp,#$S2
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
|
||||
|
||||
ldr $bi,[$ap_real,#64]
|
||||
ldp $a0,$a1,[sp,#$H]
|
||||
ldp $a2,$a3,[sp,#$H+16]
|
||||
add $bp,$ap_real,#64
|
||||
add $rp,sp,#$res_z
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
|
||||
|
||||
ldr $bi,[$bp_real,#32]
|
||||
ldp $a0,$a1,[sp,#$S2]
|
||||
ldp $a2,$a3,[sp,#$S2+16]
|
||||
add $bp,$bp_real,#32
|
||||
add $rp,sp,#$S2
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
|
||||
|
||||
add $bp,$ap_real,#32
|
||||
ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont
|
||||
ldp $a2,$a3,[sp,#$H+16]
|
||||
add $rp,sp,#$R
|
||||
bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
|
||||
|
||||
add $rp,sp,#$Hsqr
|
||||
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
|
||||
|
||||
ldp $a0,$a1,[sp,#$R]
|
||||
ldp $a2,$a3,[sp,#$R+16]
|
||||
add $rp,sp,#$Rsqr
|
||||
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
|
||||
|
||||
ldr $bi,[sp,#$H]
|
||||
ldp $a0,$a1,[sp,#$Hsqr]
|
||||
ldp $a2,$a3,[sp,#$Hsqr+16]
|
||||
add $bp,sp,#$H
|
||||
add $rp,sp,#$Hcub
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
|
||||
|
||||
ldr $bi,[$ap_real]
|
||||
ldp $a0,$a1,[sp,#$Hsqr]
|
||||
ldp $a2,$a3,[sp,#$Hsqr+16]
|
||||
add $bp,$ap_real,#0
|
||||
add $rp,sp,#$U2
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
|
||||
|
||||
mov $t0,$acc0
|
||||
mov $t1,$acc1
|
||||
mov $t2,$acc2
|
||||
mov $t3,$acc3
|
||||
add $rp,sp,#$Hsqr
|
||||
bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);
|
||||
|
||||
add $bp,sp,#$Rsqr
|
||||
add $rp,sp,#$res_x
|
||||
bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
|
||||
|
||||
add $bp,sp,#$Hcub
|
||||
bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
|
||||
|
||||
add $bp,sp,#$U2
|
||||
ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont
|
||||
ldp $a0,$a1,[sp,#$Hcub]
|
||||
ldp $a2,$a3,[sp,#$Hcub+16]
|
||||
add $rp,sp,#$res_y
|
||||
bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
|
||||
|
||||
add $bp,$ap_real,#32
|
||||
add $rp,sp,#$S2
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
|
||||
|
||||
ldr $bi,[sp,#$R]
|
||||
ldp $a0,$a1,[sp,#$res_y]
|
||||
ldp $a2,$a3,[sp,#$res_y+16]
|
||||
add $bp,sp,#$R
|
||||
add $rp,sp,#$res_y
|
||||
bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
|
||||
|
||||
add $bp,sp,#$S2
|
||||
bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
|
||||
|
||||
ldp $a0,$a1,[sp,#$res_x] // res
|
||||
ldp $a2,$a3,[sp,#$res_x+16]
|
||||
ldp $t0,$t1,[$bp_real] // in2
|
||||
ldp $t2,$t3,[$bp_real,#16]
|
||||
___
|
||||
for($i=0;$i<64;$i+=32) { # conditional moves
|
||||
$code.=<<___;
|
||||
ldp $acc0,$acc1,[$ap_real,#$i] // in1
|
||||
cmp $in1infty,#0 // !$in1infty, remember?
|
||||
ldp $acc2,$acc3,[$ap_real,#$i+16]
|
||||
csel $t0,$a0,$t0,ne
|
||||
csel $t1,$a1,$t1,ne
|
||||
ldp $a0,$a1,[sp,#$res_x+$i+32] // res
|
||||
csel $t2,$a2,$t2,ne
|
||||
csel $t3,$a3,$t3,ne
|
||||
cmp $in2infty,#0 // !$in2infty, remember?
|
||||
ldp $a2,$a3,[sp,#$res_x+$i+48]
|
||||
csel $acc0,$t0,$acc0,ne
|
||||
csel $acc1,$t1,$acc1,ne
|
||||
ldp $t0,$t1,[$bp_real,#$i+32] // in2
|
||||
csel $acc2,$t2,$acc2,ne
|
||||
csel $acc3,$t3,$acc3,ne
|
||||
ldp $t2,$t3,[$bp_real,#$i+48]
|
||||
stp $acc0,$acc1,[$rp_real,#$i]
|
||||
stp $acc2,$acc3,[$rp_real,#$i+16]
|
||||
___
|
||||
$code.=<<___ if ($i == 0);
|
||||
adr $bp_real,.Lone_mont-64
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
ldp $acc0,$acc1,[$ap_real,#$i] // in1
|
||||
cmp $in1infty,#0 // !$in1infty, remember?
|
||||
ldp $acc2,$acc3,[$ap_real,#$i+16]
|
||||
csel $t0,$a0,$t0,ne
|
||||
csel $t1,$a1,$t1,ne
|
||||
csel $t2,$a2,$t2,ne
|
||||
csel $t3,$a3,$t3,ne
|
||||
cmp $in2infty,#0 // !$in2infty, remember?
|
||||
csel $acc0,$t0,$acc0,ne
|
||||
csel $acc1,$t1,$acc1,ne
|
||||
csel $acc2,$t2,$acc2,ne
|
||||
csel $acc3,$t3,$acc3,ne
|
||||
stp $acc0,$acc1,[$rp_real,#$i]
|
||||
stp $acc2,$acc3,[$rp_real,#$i+16]
|
||||
|
||||
add sp,x29,#0 // destroy frame
|
||||
ldp x19,x20,[x29,#16]
|
||||
ldp x21,x22,[x29,#32]
|
||||
ldp x23,x24,[x29,#48]
|
||||
ldp x25,x26,[x29,#64]
|
||||
ldp x29,x30,[sp],#80
|
||||
ret
|
||||
.size GFp_nistz256_point_add_affine,.-GFp_nistz256_point_add_affine
|
||||
___
|
||||
} }
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT or die "error closing STDOUT";
|
||||
1122
zeroidc/vendor/ring/crypto/fipsmodule/ec/asm/ecp_nistz256-x86.pl
vendored
Normal file
File diff suppressed because it is too large
4202
zeroidc/vendor/ring/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
vendored
Normal file
File diff suppressed because it is too large
52
zeroidc/vendor/ring/crypto/fipsmodule/ec/ecp_nistz.c
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
/* Copyright (c) 2014, Intel Corporation.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
#include "ecp_nistz.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wconversion"
|
||||
#endif
|
||||
|
||||
/* Fills |str| with the bytewise little-endian encoding of |scalar|, where
|
||||
* |scalar| has |num_limbs| limbs. |str| is padded with zeros at the end up
|
||||
* to |str_len| bytes. Actually, |str_len| must be exactly one byte more than
|
||||
* needed to encode |num_limbs| losslessly, so that there is an extra byte at
|
||||
* the end. The extra byte is useful because the caller will be breaking |str|
|
||||
* up into windows of a number of bits (5 or 7) that isn't divisible by 8, and
|
||||
* so it is useful for it to be able to read an extra zero byte. */
|
||||
void gfp_little_endian_bytes_from_scalar(uint8_t str[], size_t str_len,
|
||||
const Limb scalar[],
|
||||
size_t num_limbs) {
|
||||
debug_assert_nonsecret(str_len == (num_limbs * sizeof(Limb)) + 1);
|
||||
|
||||
size_t i;
|
||||
for (i = 0; i < num_limbs * sizeof(Limb); i += sizeof(Limb)) {
|
||||
Limb d = scalar[i / sizeof(Limb)];
|
||||
|
||||
str[i + 0] = d & 0xff;
|
||||
str[i + 1] = (d >> 8) & 0xff;
|
||||
str[i + 2] = (d >> 16) & 0xff;
|
||||
str[i + 3] = (d >>= 24) & 0xff;
|
||||
if (sizeof(Limb) == 8) {
|
||||
d >>= 8;
|
||||
str[i + 4] = d & 0xff;
|
||||
str[i + 5] = (d >> 8) & 0xff;
|
||||
str[i + 6] = (d >> 16) & 0xff;
|
||||
str[i + 7] = (d >> 24) & 0xff;
|
||||
}
|
||||
}
|
||||
for (; i < str_len; i++) {
|
||||
str[i] = 0;
|
||||
}
|
||||
}
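
As a quick illustration of the layout this produces (a standalone sketch with hypothetical values, not part of the vendored code; it assumes 64-bit limbs): for a 4-limb scalar, |str_len| must be 4*8 + 1 = 33 and the final byte is always zero, which is exactly the extra byte the window-reading callers rely on.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void) {
  const uint64_t scalar[4] = {0x0807060504030201u, 0, 0, 0};
  uint8_t str[4 * 8 + 1];

  /* Same bytewise little-endian flattening as the function above,
   * written directly for a fixed 64-bit limb size. */
  for (size_t i = 0; i < sizeof(str) - 1; i++) {
    str[i] = (uint8_t)(scalar[i / 8] >> (8 * (i % 8)));
  }
  str[sizeof(str) - 1] = 0;

  assert(str[0] == 0x01 && str[7] == 0x08);  /* least significant limb first */
  assert(str[32] == 0);                      /* the guaranteed trailing zero */
  return 0;
}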
|
||||
274
zeroidc/vendor/ring/crypto/fipsmodule/ec/ecp_nistz.h
vendored
Normal file
@@ -0,0 +1,274 @@
|
||||
/* Copyright (c) 2015, Google Inc.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
#ifndef OPENSSL_HEADER_EC_ECP_NISTZ_H
|
||||
#define OPENSSL_HEADER_EC_ECP_NISTZ_H
|
||||
|
||||
#include <GFp/base.h>
|
||||
|
||||
#include "../../limbs/limbs.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wconversion"
|
||||
#pragma GCC diagnostic ignored "-Wsign-conversion"
|
||||
#endif
|
||||
|
||||
// This function looks at `w + 1` scalar bits (`w` current, 1 adjacent less
|
||||
// significant bit), and recodes them into a signed digit for use in fast point
|
||||
// multiplication: the use of signed rather than unsigned digits means that
|
||||
// fewer points need to be precomputed, given that point inversion is easy (a
|
||||
// precomputed point dP makes -dP available as well).
|
||||
//
|
||||
// BACKGROUND:
|
||||
//
|
||||
// Signed digits for multiplication were introduced by Booth ("A signed binary
|
||||
// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV,
|
||||
// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers.
|
||||
// Booth's original encoding did not generally improve the density of nonzero
|
||||
// digits over the binary representation, and was merely meant to simplify the
|
||||
// handling of signed factors given in two's complement; but it has since been
|
||||
// shown to be the basis of various signed-digit representations that do have
|
||||
// further advantages, including the wNAF, using the following general
|
||||
// approach:
|
||||
//
|
||||
// (1) Given a binary representation
|
||||
//
|
||||
// b_k ... b_2 b_1 b_0,
|
||||
//
|
||||
// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1
|
||||
// by using bit-wise subtraction as follows:
|
||||
//
|
||||
// b_k b_(k-1) ... b_2 b_1 b_0
|
||||
// - b_k ... b_3 b_2 b_1 b_0
|
||||
// -----------------------------------------
|
||||
// s_(k+1) s_k ... s_3 s_2 s_1 s_0
|
||||
//
|
||||
// A left-shift followed by subtraction of the original value yields a new
|
||||
// representation of the same value, using signed bits s_i = b_(i-1) - b_i.
|
||||
// This representation from Booth's paper has since appeared in the
|
||||
// literature under a variety of different names including "reversed binary
|
||||
// form", "alternating greedy expansion", "mutual opposite form", and
|
||||
// "sign-alternating {+-1}-representation".
|
||||
//
|
||||
// An interesting property is that among the nonzero bits, values 1 and -1
|
||||
// strictly alternate.
|
||||
//
|
||||
// (2) Various window schemes can be applied to the Booth representation of
|
||||
// integers: for example, right-to-left sliding windows yield the wNAF
|
||||
// (a signed-digit encoding independently discovered by various researchers
|
||||
// in the 1990s), and left-to-right sliding windows yield a left-to-right
|
||||
// equivalent of the wNAF (independently discovered by various researchers
|
||||
// around 2004).
|
||||
//
|
||||
// To prevent leaking information through side channels in point multiplication,
|
||||
// we need to recode the given integer into a regular pattern: sliding windows
|
||||
// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few
|
||||
// decades older: we'll be using the so-called "modified Booth encoding" due to
|
||||
// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49
|
||||
// (1961), pp. 67-91), in a radix-2**w setting. That is, we always combine `w`
|
||||
// signed bits into a signed digit, e.g. (for `w == 5`):
|
||||
//
|
||||
// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j)
|
||||
//
|
||||
// The sign-alternating property implies that the resulting digit values are
|
||||
// integers from `-2**(w-1)` to `2**(w-1)`, e.g. -16 to 16 for `w == 5`.
|
||||
//
|
||||
// Of course, we don't actually need to compute the signed digits s_i as an
|
||||
// intermediate step (that's just a nice way to see how this scheme relates
|
||||
// to the wNAF): a direct computation obtains the recoded digit from the
|
||||
// six bits b_(5j + 4) ... b_(5j - 1).
|
||||
//
|
||||
// This function takes those `w + 1` bits as an integer (e.g. 0 .. 63), writing the
|
||||
// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
|
||||
// value, in the range 0 .. 2**(w-1)). Note that this integer essentially provides
|
||||
// the input bits "shifted to the left" by one position: for example, the input
|
||||
// to compute the least significant recoded digit, given that there's no bit
|
||||
// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0.
|
||||
//
|
||||
// DOUBLING CASE:
|
||||
//
|
||||
// Point addition formulas for short Weierstrass curves are often incomplete.
|
||||
// Edge cases such as P + P or P + ∞ must be handled separately. This
|
||||
// complicates constant-time requirements. P + ∞ cannot be avoided (any window
|
||||
// may be zero) and is handled with constant-time selects. P + P (where P is not
|
||||
// ∞) usually is not. Instead, windowing strategies are chosen to avoid this
|
||||
// case. Whether this happens depends on the group order.
|
||||
//
|
||||
// Let w be the window width (in this function, w = 5). The non-trivial doubling
|
||||
// case in single-point scalar multiplication may occur if and only if the
|
||||
// 2^(w-1) bit of the group order is zero.
|
||||
//
|
||||
// Note the above only holds if the scalar is fully reduced and the group order
|
||||
// is a prime that is much larger than 2^w. It also only holds when windows
|
||||
// are applied from most significant to least significant, doubling between each
|
||||
// window. It does not apply to more complex table strategies such as
|
||||
// |EC_GFp_nistz256_method|.
|
||||
//
|
||||
// PROOF:
|
||||
//
|
||||
// Let n be the group order. Let l be the number of bits needed to represent n.
|
||||
// Assume there exists some 0 <= k < n such that signed w-bit windowed
|
||||
// multiplication hits the doubling case.
|
||||
//
|
||||
// Windowed multiplication consists of iterating over groups of s_i (defined
|
||||
// above based on k's binary representation) from most to least significant. At
|
||||
// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant
|
||||
// window), we:
|
||||
//
|
||||
// 1. Double the accumulator A, w times. Let A_i be the value of A at this
|
||||
// point.
|
||||
//
|
||||
// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P
|
||||
// corresponding to the window s_(i+w-1) ... s_i.
|
||||
//
|
||||
// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as
|
||||
// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i.
|
||||
// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is
|
||||
// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i =
|
||||
// 2^w * (a_(i+w) + t_(i+w)).
|
||||
//
|
||||
// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it
|
||||
// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i.
|
||||
// This is computed as:
|
||||
//
|
||||
// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1)
|
||||
// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i
|
||||
// --------------------------------------------
|
||||
// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i
|
||||
//
|
||||
// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer
|
||||
// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i.
|
||||
//
|
||||
// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x)
|
||||
// = x - 2^(w-1)*b_(i+w-1) + b_(i-1)
|
||||
//
|
||||
// Or, using C notation for bit operations:
|
||||
//
|
||||
// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1
|
||||
//
|
||||
// Note b_(i-1) is added in left-shifted by one (or doubled) from its place.
|
||||
// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed
|
||||
// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C
|
||||
// notation, this is:
|
||||
//
|
||||
// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w
|
||||
//
|
||||
// Observe that, while t_i may be positive or negative, a_i is bounded by
|
||||
// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up
|
||||
// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for
|
||||
// all groups. That would imply the subsequent a_i is zero, which means all
|
||||
// terms thus far were zero.)
|
||||
//
|
||||
// Returning to our doubling position, we have a_j = t_j (mod n). We now
|
||||
// determine the value of a_j - t_j, which must be divisible by n. Our bounds on
|
||||
// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w
|
||||
// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if
|
||||
// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n.
|
||||
//
|
||||
// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then,
|
||||
//
|
||||
// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j
|
||||
// <= k/2^j + 2^w - t_j
|
||||
// < n/2^w + 2^w + 2^(w-1)
|
||||
//
|
||||
// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final
|
||||
// addition may hit the doubling case.
|
||||
//
|
||||
// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L
|
||||
// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the
|
||||
// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0.
|
||||
// That is:
|
||||
//
|
||||
// - 2^w divides k_H
|
||||
// - k_M is 0 or 2^(w-1)
|
||||
// - 0 <= k_L < 2^(w-1)
|
||||
//
|
||||
// Divide n into n_H + n_M + n_L similarly. We thus have:
|
||||
//
|
||||
// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1
|
||||
// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1))
|
||||
// = k_L - k_M
|
||||
//
|
||||
// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w
|
||||
// = (k>>w) << w + ((k>>(w-1)) & 1) << w
|
||||
// = k_H + 2*k_M
|
||||
//
|
||||
// n = a_0 - t_0
|
||||
// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M)
|
||||
// = k_H + 3*k_M - k_L
|
||||
//
|
||||
// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be
|
||||
// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H.
|
||||
// Then,
|
||||
//
|
||||
// n_M + n_L = 3*(2^(w-1)) - k_L
|
||||
// > 3*(2^(w-1)) - 2^(w-1)
|
||||
// = 2^w
|
||||
//
|
||||
// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose
|
||||
// k_H < n_H - 2*2^w. Then,
|
||||
//
|
||||
// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L
|
||||
// < n_H - 2*2^w + 3*(2^(w-1)) - k_L
|
||||
// n_M + n_L < -2^(w-1) - k_L
|
||||
//
|
||||
// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus,
|
||||
//
|
||||
// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L
|
||||
// = n_H - 2^w + 3*(2^(w-1)) - k_L
|
||||
// n_M + n_L = 2^(w-1) - k_L
|
||||
// <= 2^(w-1)
|
||||
//
|
||||
// Equality would mean 2^(w-1) divides n, which is impossible if n is prime.
|
||||
// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition.
|
||||
//
|
||||
// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w,
|
||||
// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point
|
||||
// doubling in the final addition and is the only such scalar.
|
||||
//
|
||||
// COMMON CURVES:
|
||||
//
|
||||
// The group orders for common curves end in the following bit patterns:
|
||||
//
|
||||
// P-521: ...00001001; w = 4 is okay
|
||||
// P-384: ...01110011; w = 2, 5, 6, 7 are okay
|
||||
// P-256: ...01010001; w = 5, 7 are okay
|
||||
// P-224: ...00111101; w = 3, 4, 5, 6 are okay
|
||||
static inline void booth_recode(crypto_word *is_negative, crypto_word *digit,
|
||||
crypto_word in, crypto_word w) {
|
||||
debug_assert_nonsecret(w >= 2);
|
||||
debug_assert_nonsecret(w <= 7);
|
||||
|
||||
// Set all bits of `s` to MSB(in), similar to |constant_time_msb_s|,
|
||||
// but with 'in' treated as a (`w+1`)-bit value.
|
||||
crypto_word s = ~((in >> w) - 1);
|
||||
crypto_word d;
|
||||
d = ((crypto_word)1u << (w + 1)) - in - 1;
|
||||
d = (d & s) | (in & ~s);
|
||||
d = (d >> 1) + (d & 1);
|
||||
|
||||
*is_negative = constant_time_is_nonzero_w(s & 1);
|
||||
*digit = d;
|
||||
}
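
A standalone check of the recoding for two sample w = 5 windows (a sketch only: it reimplements the arithmetic above with plain uint64_t in place of crypto_word and |constant_time_is_nonzero_w|, and the input values are arbitrary):

#include <assert.h>
#include <stdint.h>

static void booth_recode_ref(uint64_t *is_negative, uint64_t *digit,
                             uint64_t in, uint64_t w) {
  uint64_t s = ~((in >> w) - 1);             /* all-ones iff the top bit of in is set */
  uint64_t d = ((uint64_t)1 << (w + 1)) - in - 1;
  d = (d & s) | (in & ~s);
  d = (d >> 1) + (d & 1);
  *is_negative = s & 1;
  *digit = d;
}

int main(void) {
  uint64_t neg, digit;
  /* Window bits b4 b3 b2 b1 b0 b_-1 = 111011: t = 13 - 16 + 1 = -2. */
  booth_recode_ref(&neg, &digit, 0x3b, 5);
  assert(neg == 1 && digit == 2);
  /* Window bits 001101: t = 6 - 0 + 1 = +7. */
  booth_recode_ref(&neg, &digit, 0x0d, 5);
  assert(neg == 0 && digit == 7);
  return 0;
}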
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
void gfp_little_endian_bytes_from_scalar(uint8_t str[], size_t str_len,
|
||||
const Limb scalar[],
|
||||
size_t num_limbs);
|
||||
|
||||
#endif // OPENSSL_HEADER_EC_ECP_NISTZ_H
|
||||
349
zeroidc/vendor/ring/crypto/fipsmodule/ec/ecp_nistz256.c
vendored
Normal file
@@ -0,0 +1,349 @@
|
||||
/* Copyright (c) 2014, Intel Corporation.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
/* Developers and authors:
|
||||
* Shay Gueron (1, 2), and Vlad Krasnov (1)
|
||||
* (1) Intel Corporation, Israel Development Center
|
||||
* (2) University of Haifa
|
||||
* Reference:
|
||||
* Shay Gueron and Vlad Krasnov
|
||||
* "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
|
||||
* http://eprint.iacr.org/2013/816 */
|
||||
|
||||
#include "ecp_nistz256.h"
|
||||
|
||||
#include "ecp_nistz.h"
|
||||
#include "../bn/internal.h"
|
||||
#include "../../limbs/limbs.inl"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wsign-conversion"
|
||||
#endif
|
||||
|
||||
/* Functions implemented in assembly */
|
||||
/* Modular neg: res = -a mod P */
|
||||
void GFp_nistz256_neg(Limb res[P256_LIMBS], const Limb a[P256_LIMBS]);
|
||||
|
||||
|
||||
/* One converted into the Montgomery domain */
|
||||
static const Limb ONE[P256_LIMBS] = {
|
||||
TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000),
|
||||
TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe),
|
||||
};
|
||||
|
||||
static void copy_conditional(Limb dst[P256_LIMBS],
|
||||
const Limb src[P256_LIMBS], Limb move) {
|
||||
Limb mask1 = move;
|
||||
Limb mask2 = ~mask1;
|
||||
|
||||
dst[0] = (src[0] & mask1) ^ (dst[0] & mask2);
|
||||
dst[1] = (src[1] & mask1) ^ (dst[1] & mask2);
|
||||
dst[2] = (src[2] & mask1) ^ (dst[2] & mask2);
|
||||
dst[3] = (src[3] & mask1) ^ (dst[3] & mask2);
|
||||
if (P256_LIMBS == 8) {
|
||||
dst[4] = (src[4] & mask1) ^ (dst[4] & mask2);
|
||||
dst[5] = (src[5] & mask1) ^ (dst[5] & mask2);
|
||||
dst[6] = (src[6] & mask1) ^ (dst[6] & mask2);
|
||||
dst[7] = (src[7] & mask1) ^ (dst[7] & mask2);
|
||||
}
|
||||
}
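
One property worth noting: |move| is expected to be either 0 or all-ones (the constant-time mask convention used by the is_zero/is_equal helpers below and by booth_recode's sign output); with any other value the masking would mix limbs from both inputs. A tiny standalone illustration with hypothetical values, not part of the vendored code:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint64_t dst[4] = {1, 2, 3, 4};
  const uint64_t src[4] = {5, 6, 7, 8};

  uint64_t take = ~(uint64_t)0;              /* all-ones: take src */
  for (int i = 0; i < 4; i++) {
    dst[i] = (src[i] & take) ^ (dst[i] & ~take);
  }
  assert(dst[0] == 5 && dst[3] == 8);

  uint64_t keep = 0;                         /* zero: keep dst */
  for (int i = 0; i < 4; i++) {
    dst[i] = (src[i] & keep) ^ (dst[i] & ~keep);
  }
  assert(dst[0] == 5 && dst[3] == 8);        /* unchanged */
  return 0;
}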
|
||||
|
||||
void GFp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
|
||||
|
||||
#if defined(GFp_USE_LARGE_TABLE)
|
||||
void GFp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
|
||||
const P256_POINT_AFFINE *b);
|
||||
#endif
|
||||
|
||||
void GFp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
|
||||
const P256_POINT *b);
|
||||
|
||||
// |GFp_nistz256_point_add| is defined in assembly language in X86-64 only.
|
||||
#if !defined(OPENSSL_X86_64)
|
||||
|
||||
static const BN_ULONG Q[P256_LIMBS] = {
|
||||
TOBN(0xffffffff, 0xffffffff),
|
||||
TOBN(0x00000000, 0xffffffff),
|
||||
TOBN(0x00000000, 0x00000000),
|
||||
TOBN(0xffffffff, 0x00000001),
|
||||
};
|
||||
|
||||
static inline Limb is_equal(const Limb a[P256_LIMBS], const Limb b[P256_LIMBS]) {
|
||||
return LIMBS_equal(a, b, P256_LIMBS);
|
||||
}
|
||||
|
||||
static inline Limb is_zero(const BN_ULONG a[P256_LIMBS]) {
|
||||
return LIMBS_are_zero(a, P256_LIMBS);
|
||||
}
|
||||
|
||||
static inline void elem_mul_by_2(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) {
|
||||
LIMBS_shl_mod(r, a, Q, P256_LIMBS);
|
||||
}
|
||||
|
||||
static inline void elem_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
|
||||
const Limb b[P256_LIMBS]) {
|
||||
GFp_nistz256_mul_mont(r, a, b);
|
||||
}
|
||||
|
||||
static inline void elem_sqr_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) {
|
||||
GFp_nistz256_sqr_mont(r, a);
|
||||
}
|
||||
|
||||
static inline void elem_sub(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
|
||||
const Limb b[P256_LIMBS]) {
|
||||
LIMBS_sub_mod(r, a, b, Q, P256_LIMBS);
|
||||
}
|
||||
|
||||
/* Point addition: r = a+b */
|
||||
void GFp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b) {
|
||||
BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
|
||||
BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS];
|
||||
BN_ULONG Z1sqr[P256_LIMBS];
|
||||
BN_ULONG Z2sqr[P256_LIMBS];
|
||||
BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
|
||||
BN_ULONG Hsqr[P256_LIMBS];
|
||||
BN_ULONG Rsqr[P256_LIMBS];
|
||||
BN_ULONG Hcub[P256_LIMBS];
|
||||
|
||||
BN_ULONG res_x[P256_LIMBS];
|
||||
BN_ULONG res_y[P256_LIMBS];
|
||||
BN_ULONG res_z[P256_LIMBS];
|
||||
|
||||
const BN_ULONG *in1_x = a->X;
|
||||
const BN_ULONG *in1_y = a->Y;
|
||||
const BN_ULONG *in1_z = a->Z;
|
||||
|
||||
const BN_ULONG *in2_x = b->X;
|
||||
const BN_ULONG *in2_y = b->Y;
|
||||
const BN_ULONG *in2_z = b->Z;
|
||||
|
||||
BN_ULONG in1infty = is_zero(a->Z);
|
||||
BN_ULONG in2infty = is_zero(b->Z);
|
||||
|
||||
elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */
|
||||
elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */
|
||||
|
||||
elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */
|
||||
elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */
|
||||
|
||||
elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */
|
||||
elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */
|
||||
elem_sub(R, S2, S1); /* R = S2 - S1 */
|
||||
|
||||
elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */
|
||||
elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */
|
||||
elem_sub(H, U2, U1); /* H = U2 - U1 */
|
||||
|
||||
BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty;
|
||||
if (is_exceptional) {
|
||||
if (is_equal(S1, S2)) {
|
||||
GFp_nistz256_point_double(r, a);
|
||||
} else {
|
||||
limbs_zero(r->X, P256_LIMBS);
|
||||
limbs_zero(r->Y, P256_LIMBS);
|
||||
limbs_zero(r->Z, P256_LIMBS);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
elem_sqr_mont(Rsqr, R); /* R^2 */
|
||||
elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */
|
||||
elem_sqr_mont(Hsqr, H); /* H^2 */
|
||||
elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */
|
||||
elem_mul_mont(Hcub, Hsqr, H); /* H^3 */
|
||||
|
||||
elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */
|
||||
elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */
|
||||
|
||||
elem_sub(res_x, Rsqr, Hsqr);
|
||||
elem_sub(res_x, res_x, Hcub);
|
||||
|
||||
elem_sub(res_y, U2, res_x);
|
||||
|
||||
elem_mul_mont(S2, S1, Hcub);
|
||||
elem_mul_mont(res_y, R, res_y);
|
||||
elem_sub(res_y, res_y, S2);
|
||||
|
||||
copy_conditional(res_x, in2_x, in1infty);
|
||||
copy_conditional(res_y, in2_y, in1infty);
|
||||
copy_conditional(res_z, in2_z, in1infty);
|
||||
|
||||
copy_conditional(res_x, in1_x, in2infty);
|
||||
copy_conditional(res_y, in1_y, in2infty);
|
||||
copy_conditional(res_z, in1_z, in2infty);
|
||||
|
||||
limbs_copy(r->X, res_x, P256_LIMBS);
|
||||
limbs_copy(r->Y, res_y, P256_LIMBS);
|
||||
limbs_copy(r->Z, res_z, P256_LIMBS);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* r = p * p_scalar */
|
||||
void GFp_nistz256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS],
|
||||
const Limb p_x[P256_LIMBS],
|
||||
const Limb p_y[P256_LIMBS]) {
|
||||
static const size_t kWindowSize = 5;
|
||||
static const crypto_word kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
|
||||
|
||||
uint8_t p_str[(P256_LIMBS * sizeof(Limb)) + 1];
|
||||
gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
|
||||
p_scalar, P256_LIMBS);
|
||||
|
||||
/* A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should
|
||||
* add no more than 63 bytes of overhead. Thus, |table| should require
|
||||
* ~1599 ((96 * 16) + 63) bytes of stack space. */
|
||||
alignas(64) P256_POINT table[16];
|
||||
|
||||
/* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
|
||||
* not stored. All other values are actually stored with an offset of -1 in
|
||||
* table. */
|
||||
P256_POINT *row = table;
|
||||
|
||||
limbs_copy(row[1 - 1].X, p_x, P256_LIMBS);
|
||||
limbs_copy(row[1 - 1].Y, p_y, P256_LIMBS);
|
||||
limbs_copy(row[1 - 1].Z, ONE, P256_LIMBS);
|
||||
|
||||
GFp_nistz256_point_double(&row[2 - 1], &row[1 - 1]);
|
||||
GFp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
|
||||
GFp_nistz256_point_double(&row[4 - 1], &row[2 - 1]);
|
||||
GFp_nistz256_point_double(&row[6 - 1], &row[3 - 1]);
|
||||
GFp_nistz256_point_double(&row[8 - 1], &row[4 - 1]);
|
||||
GFp_nistz256_point_double(&row[12 - 1], &row[6 - 1]);
|
||||
GFp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
|
||||
GFp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
|
||||
GFp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
|
||||
GFp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
|
||||
GFp_nistz256_point_double(&row[14 - 1], &row[7 - 1]);
|
||||
GFp_nistz256_point_double(&row[10 - 1], &row[5 - 1]);
|
||||
GFp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
|
||||
GFp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
|
||||
GFp_nistz256_point_double(&row[16 - 1], &row[8 - 1]);
|
||||
|
||||
Limb tmp[P256_LIMBS];
|
||||
alignas(32) P256_POINT h;
|
||||
static const size_t START_INDEX = 256 - 1;
|
||||
size_t index = START_INDEX;
|
||||
|
||||
crypto_word raw_wvalue;
|
||||
crypto_word recoded_is_negative;
|
||||
crypto_word recoded;
|
||||
|
||||
raw_wvalue = p_str[(index - 1) / 8];
|
||||
raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask;
|
||||
booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize);
|
||||
dev_assert_secret(!recoded_is_negative);
|
||||
GFp_nistz256_select_w5(r, table, recoded);
|
||||
|
||||
while (index >= kWindowSize) {
|
||||
if (index != START_INDEX) {
|
||||
size_t off = (index - 1) / 8;
|
||||
|
||||
raw_wvalue = p_str[off] | p_str[off + 1] << 8;
|
||||
raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask;
|
||||
booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize);
|
||||
|
||||
GFp_nistz256_select_w5(&h, table, recoded);
|
||||
GFp_nistz256_neg(tmp, h.Y);
|
||||
copy_conditional(h.Y, tmp, recoded_is_negative);
|
||||
|
||||
GFp_nistz256_point_add(r, r, &h);
|
||||
}
|
||||
|
||||
index -= kWindowSize;
|
||||
|
||||
GFp_nistz256_point_double(r, r);
|
||||
GFp_nistz256_point_double(r, r);
|
||||
GFp_nistz256_point_double(r, r);
|
||||
GFp_nistz256_point_double(r, r);
|
||||
GFp_nistz256_point_double(r, r);
|
||||
}
|
||||
|
||||
/* Final window */
|
||||
raw_wvalue = p_str[0];
|
||||
raw_wvalue = (raw_wvalue << 1) & kMask;
|
||||
|
||||
booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize);
|
||||
GFp_nistz256_select_w5(&h, table, recoded);
|
||||
GFp_nistz256_neg(tmp, h.Y);
|
||||
copy_conditional(h.Y, tmp, recoded_is_negative);
|
||||
GFp_nistz256_point_add(r, r, &h);
|
||||
}
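
The two-byte read and shift in the loop above pulls out a 6-bit field that can straddle a byte boundary: the kWindowSize scalar bits plus the adjacent lower bit that booth_recode needs. A small standalone sketch of just that extraction (read_window6 and the sample bytes are hypothetical):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Read scalar bits (index - 1) .. (index + 4) from the little-endian
 * encoding, i.e. the w = 5 window starting at |index| plus one extra low
 * bit, exactly as the loop above does. */
static unsigned read_window6(const uint8_t *p_str, size_t index) {
  size_t off = (index - 1) / 8;
  unsigned raw = (unsigned)p_str[off] | ((unsigned)p_str[off + 1] << 8);
  return (raw >> ((index - 1) % 8)) & 0x3f;   /* kMask for w = 5 */
}

int main(void) {
  uint8_t p_str[33] = {0x2d, 0x01};           /* scalar = 0x12d, rest zero */
  printf("window at index 5 = 0x%02x\n", read_window6(p_str, 5));  /* 0x12 */
  return 0;
}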
|
||||
|
||||
#if defined(GFp_USE_LARGE_TABLE)
|
||||
|
||||
/* Precomputed tables for the default generator */
|
||||
#include "ecp_nistz256_table.inl"
|
||||
|
||||
static const size_t kWindowSize = 7;
|
||||
|
||||
static inline void select_precomputed(P256_POINT_AFFINE *p, size_t i,
|
||||
crypto_word raw_wvalue) {
|
||||
crypto_word recoded_is_negative;
|
||||
crypto_word recoded;
|
||||
booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize);
|
||||
GFp_nistz256_select_w7(p, GFp_nistz256_precomputed[i], recoded);
|
||||
Limb neg_y[P256_LIMBS];
|
||||
GFp_nistz256_neg(neg_y, p->Y);
|
||||
copy_conditional(p->Y, neg_y, recoded_is_negative);
|
||||
}
|
||||
|
||||
/* This assumes that |x| and |y| have each been reduced to their minimal
|
||||
* unique representations. */
|
||||
static Limb is_infinity(const Limb x[P256_LIMBS],
|
||||
const Limb y[P256_LIMBS]) {
|
||||
Limb acc = 0;
|
||||
for (size_t i = 0; i < P256_LIMBS; ++i) {
|
||||
acc |= x[i] | y[i];
|
||||
}
|
||||
return constant_time_is_zero_w(acc);
|
||||
}
|
||||
|
||||
void GFp_nistz256_point_mul_base(P256_POINT *r,
|
||||
const Limb g_scalar[P256_LIMBS]) {
|
||||
static const crypto_word kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
|
||||
|
||||
uint8_t p_str[(P256_LIMBS * sizeof(Limb)) + 1];
|
||||
gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
|
||||
g_scalar, P256_LIMBS);
|
||||
|
||||
/* First window */
|
||||
size_t index = kWindowSize;
|
||||
|
||||
alignas(32) P256_POINT_AFFINE t;
|
||||
|
||||
crypto_word raw_wvalue = (p_str[0] << 1) & kMask;
|
||||
select_precomputed(&t, 0, raw_wvalue);
|
||||
|
||||
alignas(32) P256_POINT p;
|
||||
limbs_copy(p.X, t.X, P256_LIMBS);
|
||||
limbs_copy(p.Y, t.Y, P256_LIMBS);
|
||||
limbs_copy(p.Z, ONE, P256_LIMBS);
|
||||
/* If it is the point at infinity then p.X will be zero. */
|
||||
copy_conditional(p.Z, p.X, is_infinity(p.X, p.Y));
|
||||
|
||||
for (size_t i = 1; i < 37; i++) {
|
||||
size_t off = (index - 1) / 8;
|
||||
raw_wvalue = p_str[off] | p_str[off + 1] << 8;
|
||||
raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask;
|
||||
index += kWindowSize;
|
||||
select_precomputed(&t, i, raw_wvalue);
|
||||
GFp_nistz256_point_add_affine(&p, &p, &t);
|
||||
}
|
||||
|
||||
limbs_copy(r->X, p.X, P256_LIMBS);
|
||||
limbs_copy(r->Y, p.Y, P256_LIMBS);
|
||||
limbs_copy(r->Z, p.Z, P256_LIMBS);
|
||||
}
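
/* The loop above runs 37 times because the first window consumes scalar bits
 * b_6..b_0 (plus the implicit zero below b_0) and each of the 36 remaining
 * 7-bit windows advances |index| by 7, so 37 windows cover bits 0..258.
 * Bits 256..258 and the final p_str[off + 1] read come from p_str[32], the
 * extra zero byte that gfp_little_endian_bytes_from_scalar always appends. */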
|
||||
|
||||
#endif
|
||||
54
zeroidc/vendor/ring/crypto/fipsmodule/ec/ecp_nistz256.h
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
/* Copyright (c) 2014, Intel Corporation.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
#ifndef OPENSSL_HEADER_EC_ECP_NISTZ256_H
|
||||
#define OPENSSL_HEADER_EC_ECP_NISTZ256_H
|
||||
|
||||
#include "../../limbs/limbs.h"
|
||||
|
||||
// Keep this in sync with p256.rs.
|
||||
#if defined(OPENSSL_AARCH64) || defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
|
||||
#define GFp_USE_LARGE_TABLE
|
||||
#endif
|
||||
|
||||
#define P256_LIMBS (256u / LIMB_BITS)
|
||||
|
||||
typedef struct {
|
||||
Limb X[P256_LIMBS];
|
||||
Limb Y[P256_LIMBS];
|
||||
Limb Z[P256_LIMBS];
|
||||
} P256_POINT;
|
||||
|
||||
#if defined(GFp_USE_LARGE_TABLE)
|
||||
typedef struct {
|
||||
Limb X[P256_LIMBS];
|
||||
Limb Y[P256_LIMBS];
|
||||
} P256_POINT_AFFINE;
|
||||
#endif
|
||||
|
||||
typedef Limb PRECOMP256_ROW[64 * 2 * P256_LIMBS]; // 64 (x, y) entries.
|
||||
|
||||
void GFp_nistz256_mul_mont(Limb res[P256_LIMBS], const Limb a[P256_LIMBS],
|
||||
const Limb b[P256_LIMBS]);
|
||||
void GFp_nistz256_sqr_mont(Limb res[P256_LIMBS], const Limb a[P256_LIMBS]);
|
||||
|
||||
/* Functions that perform constant time access to the precomputed tables */
|
||||
void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16],
|
||||
crypto_word index);
|
||||
|
||||
#if defined(GFp_USE_LARGE_TABLE)
|
||||
void GFp_nistz256_select_w7(P256_POINT_AFFINE *out, const PRECOMP256_ROW table, crypto_word index);
|
||||
#endif
|
||||
|
||||
#endif /* OPENSSL_HEADER_EC_ECP_NISTZ256_H */
|
||||
9501
zeroidc/vendor/ring/crypto/fipsmodule/ec/ecp_nistz256_table.inl
vendored
Normal file
File diff suppressed because it is too large
34
zeroidc/vendor/ring/crypto/fipsmodule/ec/ecp_nistz384.h
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
/* Copyright (c) 2014, Intel Corporation.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H
|
||||
#define OPENSSL_HEADER_EC_ECP_NISTZ384_H
|
||||
|
||||
#include "../../limbs/limbs.h"
|
||||
|
||||
#define P384_LIMBS (384u / LIMB_BITS)
|
||||
|
||||
typedef struct {
|
||||
Limb X[P384_LIMBS];
|
||||
Limb Y[P384_LIMBS];
|
||||
Limb Z[P384_LIMBS];
|
||||
} P384_POINT;
|
||||
|
||||
typedef struct {
|
||||
Limb X[P384_LIMBS];
|
||||
Limb Y[P384_LIMBS];
|
||||
} P384_POINT_AFFINE;
|
||||
|
||||
|
||||
#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H
|
||||
257
zeroidc/vendor/ring/crypto/fipsmodule/ec/ecp_nistz384.inl
vendored
Normal file
@@ -0,0 +1,257 @@
|
||||
/* Copyright (c) 2014, Intel Corporation.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
/* Developers and authors:
|
||||
* Shay Gueron (1, 2), and Vlad Krasnov (1)
|
||||
* (1) Intel Corporation, Israel Development Center
|
||||
* (2) University of Haifa
|
||||
* Reference:
|
||||
* Shay Gueron and Vlad Krasnov
|
||||
* "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
|
||||
* http://eprint.iacr.org/2013/816 */
|
||||
|
||||
#include "ecp_nistz.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wsign-conversion"
|
||||
#endif
|
||||
|
||||
/* Point double: r = 2*a */
|
||||
void GFp_nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
|
||||
BN_ULONG S[P384_LIMBS];
|
||||
BN_ULONG M[P384_LIMBS];
|
||||
BN_ULONG Zsqr[P384_LIMBS];
|
||||
BN_ULONG tmp0[P384_LIMBS];
|
||||
|
||||
const BN_ULONG *in_x = a->X;
|
||||
const BN_ULONG *in_y = a->Y;
|
||||
const BN_ULONG *in_z = a->Z;
|
||||
|
||||
BN_ULONG *res_x = r->X;
|
||||
BN_ULONG *res_y = r->Y;
|
||||
BN_ULONG *res_z = r->Z;
|
||||
|
||||
elem_mul_by_2(S, in_y);
|
||||
|
||||
elem_sqr_mont(Zsqr, in_z);
|
||||
|
||||
elem_sqr_mont(S, S);
|
||||
|
||||
elem_mul_mont(res_z, in_z, in_y);
|
||||
elem_mul_by_2(res_z, res_z);
|
||||
|
||||
elem_add(M, in_x, Zsqr);
|
||||
elem_sub(Zsqr, in_x, Zsqr);
|
||||
|
||||
elem_sqr_mont(res_y, S);
|
||||
elem_div_by_2(res_y, res_y);
|
||||
|
||||
elem_mul_mont(M, M, Zsqr);
|
||||
elem_mul_by_3(M, M);
|
||||
|
||||
elem_mul_mont(S, S, in_x);
|
||||
elem_mul_by_2(tmp0, S);
|
||||
|
||||
elem_sqr_mont(res_x, M);
|
||||
|
||||
elem_sub(res_x, res_x, tmp0);
|
||||
elem_sub(S, S, res_x);
|
||||
|
||||
elem_mul_mont(S, S, M);
|
||||
elem_sub(res_y, S, res_y);
|
||||
}
|
||||
|
||||
/* Point addition: r = a+b */
|
||||
void GFp_nistz384_point_add(P384_POINT *r, const P384_POINT *a,
|
||||
const P384_POINT *b) {
|
||||
BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS];
|
||||
BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS];
|
||||
BN_ULONG Z1sqr[P384_LIMBS];
|
||||
BN_ULONG Z2sqr[P384_LIMBS];
|
||||
BN_ULONG H[P384_LIMBS], R[P384_LIMBS];
|
||||
BN_ULONG Hsqr[P384_LIMBS];
|
||||
BN_ULONG Rsqr[P384_LIMBS];
|
||||
BN_ULONG Hcub[P384_LIMBS];
|
||||
|
||||
BN_ULONG res_x[P384_LIMBS];
|
||||
BN_ULONG res_y[P384_LIMBS];
|
||||
BN_ULONG res_z[P384_LIMBS];
|
||||
|
||||
const BN_ULONG *in1_x = a->X;
|
||||
const BN_ULONG *in1_y = a->Y;
|
||||
const BN_ULONG *in1_z = a->Z;
|
||||
|
||||
const BN_ULONG *in2_x = b->X;
|
||||
const BN_ULONG *in2_y = b->Y;
|
||||
const BN_ULONG *in2_z = b->Z;
|
||||
|
||||
BN_ULONG in1infty = is_zero(a->Z);
|
||||
BN_ULONG in2infty = is_zero(b->Z);
|
||||
|
||||
elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */
|
||||
elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */
|
||||
|
||||
elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */
|
||||
elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */
|
||||
|
||||
elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */
|
||||
elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */
|
||||
elem_sub(R, S2, S1); /* R = S2 - S1 */
|
||||
|
||||
elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */
|
||||
elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */
|
||||
elem_sub(H, U2, U1); /* H = U2 - U1 */
|
||||
|
||||
BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty;
|
||||
if (is_exceptional) {
|
||||
if (is_equal(S1, S2)) {
|
||||
GFp_nistz384_point_double(r, a);
|
||||
} else {
|
||||
limbs_zero(r->X, P384_LIMBS);
|
||||
limbs_zero(r->Y, P384_LIMBS);
|
||||
limbs_zero(r->Z, P384_LIMBS);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
elem_sqr_mont(Rsqr, R); /* R^2 */
|
||||
elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */
|
||||
elem_sqr_mont(Hsqr, H); /* H^2 */
|
||||
elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */
|
||||
elem_mul_mont(Hcub, Hsqr, H); /* H^3 */
|
||||
|
||||
elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */
|
||||
elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */
|
||||
|
||||
elem_sub(res_x, Rsqr, Hsqr);
|
||||
elem_sub(res_x, res_x, Hcub);
|
||||
|
||||
elem_sub(res_y, U2, res_x);
|
||||
|
||||
elem_mul_mont(S2, S1, Hcub);
|
||||
elem_mul_mont(res_y, R, res_y);
|
||||
elem_sub(res_y, res_y, S2);
|
||||
|
||||
copy_conditional(res_x, in2_x, in1infty);
|
||||
copy_conditional(res_y, in2_y, in1infty);
|
||||
copy_conditional(res_z, in2_z, in1infty);
|
||||
|
||||
copy_conditional(res_x, in1_x, in2infty);
|
||||
copy_conditional(res_y, in1_y, in2infty);
|
||||
copy_conditional(res_z, in1_z, in2infty);
|
||||
|
||||
limbs_copy(r->X, res_x, P384_LIMBS);
|
||||
limbs_copy(r->Y, res_y, P384_LIMBS);
|
||||
limbs_copy(r->Z, res_z, P384_LIMBS);
|
||||
}
|
||||
|
||||
static void add_precomputed_w5(P384_POINT *r, crypto_word wvalue,
|
||||
const P384_POINT table[16]) {
|
||||
crypto_word recoded_is_negative;
|
||||
crypto_word recoded;
|
||||
booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
|
||||
|
||||
alignas(64) P384_POINT h;
|
||||
gfp_p384_point_select_w5(&h, table, recoded);
|
||||
|
||||
alignas(64) BN_ULONG tmp[P384_LIMBS];
|
||||
GFp_p384_elem_neg(tmp, h.Y);
|
||||
copy_conditional(h.Y, tmp, recoded_is_negative);
|
||||
|
||||
GFp_nistz384_point_add(r, r, &h);
|
||||
}
|
||||
|
||||
/* r = p * p_scalar */
|
||||
void GFp_nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS],
|
||||
const BN_ULONG p_x[P384_LIMBS],
|
||||
const BN_ULONG p_y[P384_LIMBS]) {
|
||||
static const size_t kWindowSize = 5;
|
||||
static const crypto_word kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
|
||||
|
||||
uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1];
|
||||
gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
|
||||
p_scalar, P384_LIMBS);
|
||||
|
||||
/* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should
|
||||
* add no more than 63 bytes of overhead. Thus, |table| should require
|
||||
* ~2367 ((144 * 16) + 63) bytes of stack space. */
|
||||
alignas(64) P384_POINT table[16];
|
||||
|
||||
/* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
|
||||
* not stored. All other values are actually stored with an offset of -1 in
|
||||
* table. */
|
||||
P384_POINT *row = table;
|
||||
|
||||
limbs_copy(row[1 - 1].X, p_x, P384_LIMBS);
|
||||
limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS);
|
||||
limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS);
|
||||
|
||||
GFp_nistz384_point_double(&row[2 - 1], &row[1 - 1]);
|
||||
GFp_nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
|
||||
GFp_nistz384_point_double(&row[4 - 1], &row[2 - 1]);
|
||||
GFp_nistz384_point_double(&row[6 - 1], &row[3 - 1]);
|
||||
GFp_nistz384_point_double(&row[8 - 1], &row[4 - 1]);
|
||||
GFp_nistz384_point_double(&row[12 - 1], &row[6 - 1]);
|
||||
GFp_nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
|
||||
GFp_nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
|
||||
GFp_nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
|
||||
GFp_nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
|
||||
GFp_nistz384_point_double(&row[14 - 1], &row[7 - 1]);
|
||||
GFp_nistz384_point_double(&row[10 - 1], &row[5 - 1]);
|
||||
GFp_nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
|
||||
GFp_nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
|
||||
GFp_nistz384_point_double(&row[16 - 1], &row[8 - 1]);
|
||||
|
||||
static const size_t START_INDEX = 384 - 4;
|
||||
size_t index = START_INDEX;
|
||||
|
||||
BN_ULONG recoded_is_negative;
|
||||
crypto_word recoded;
|
||||
|
||||
crypto_word wvalue = p_str[(index - 1) / 8];
|
||||
wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
|
||||
|
||||
booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
|
||||
dev_assert_secret(!recoded_is_negative);
|
||||
|
||||
gfp_p384_point_select_w5(r, table, recoded);
|
||||
|
||||
while (index >= kWindowSize) {
|
||||
if (index != START_INDEX) {
|
||||
size_t off = (index - 1) / 8;
|
||||
|
||||
wvalue = p_str[off] | p_str[off + 1] << 8;
|
||||
wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
|
||||
add_precomputed_w5(r, wvalue, table);
|
||||
}
|
||||
|
||||
index -= kWindowSize;
|
||||
|
||||
GFp_nistz384_point_double(r, r);
|
||||
GFp_nistz384_point_double(r, r);
|
||||
GFp_nistz384_point_double(r, r);
|
||||
GFp_nistz384_point_double(r, r);
|
||||
GFp_nistz384_point_double(r, r);
|
||||
}
|
||||
|
||||
/* Final window */
|
||||
wvalue = p_str[0];
|
||||
wvalue = (wvalue << 1) & kMask;
|
||||
add_precomputed_w5(r, wvalue, table);
|
||||
}
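
/* START_INDEX is 384 - 4 = 380 because 384 = 76 * 5 + 4: the most
 * significant window holds only the top four scalar bits (the bit above
 * b_383 is an implicit zero), and the remaining 76 five-bit windows (75 in
 * the loop plus the final window) cover bits 0..379. */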
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
108
zeroidc/vendor/ring/crypto/fipsmodule/ec/gfp_p256.c
vendored
Normal file
@@ -0,0 +1,108 @@
|
||||
/* Copyright 2016 Brian Smith.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
#include "ecp_nistz256.h"
|
||||
#include "../../limbs/limbs.h"
|
||||
|
||||
#include "../../internal.h"
|
||||
#include "../bn/internal.h"
|
||||
#include "../../limbs/limbs.inl"
|
||||
|
||||
typedef Limb Elem[P256_LIMBS];
|
||||
typedef Limb ScalarMont[P256_LIMBS];
|
||||
typedef Limb Scalar[P256_LIMBS];
|
||||
|
||||
void GFp_p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep);
|
||||
|
||||
#if defined(OPENSSL_ARM) || defined(OPENSSL_X86)
|
||||
void GFp_nistz256_sqr_mont(Elem r, const Elem a) {
|
||||
/* XXX: Inefficient. TODO: optimize with dedicated squaring routine. */
|
||||
GFp_nistz256_mul_mont(r, a, a);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_X86_64)
|
||||
void GFp_p256_scalar_mul_mont(ScalarMont r, const ScalarMont a,
|
||||
const ScalarMont b) {
|
||||
static const BN_ULONG N[] = {
|
||||
TOBN(0xf3b9cac2, 0xfc632551),
|
||||
TOBN(0xbce6faad, 0xa7179e84),
|
||||
TOBN(0xffffffff, 0xffffffff),
|
||||
TOBN(0xffffffff, 0x00000000),
|
||||
};
|
||||
static const BN_ULONG N_N0[] = {
|
||||
BN_MONT_CTX_N0(0xccd1c8aa, 0xee00bc4f)
|
||||
};
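/* |N| above is the P-256 group order n (least-significant limb first) and
 * |N_N0| packs the Montgomery constant n0 = -n^-1 mod 2^w, where w is the
 * word size |GFp_bn_mul_mont| reduces by; both are needed for word-at-a-time
 * Montgomery multiplication modulo n. */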
|
||||
/* XXX: Inefficient. TODO: optimize with dedicated multiplication routine. */
|
||||
GFp_bn_mul_mont(r, a, b, N, N_N0, P256_LIMBS);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(OPENSSL_X86_64)
|
||||
void GFp_p256_scalar_sqr_mont(ScalarMont r, const ScalarMont a) {
|
||||
GFp_p256_scalar_sqr_rep_mont(r, a, 1);
|
||||
}
|
||||
#else
|
||||
void GFp_p256_scalar_sqr_mont(ScalarMont r, const ScalarMont a) {
|
||||
GFp_p256_scalar_mul_mont(r, a, a);
|
||||
}
|
||||
|
||||
void GFp_p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep) {
|
||||
dev_assert_secret(rep >= 1);
|
||||
GFp_p256_scalar_sqr_mont(r, a);
|
||||
for (Limb i = 1; i < rep; ++i) {
|
||||
GFp_p256_scalar_sqr_mont(r, r);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(OPENSSL_X86_64)
|
||||
|
||||
/* TODO(perf): Optimize these. */
|
||||
|
||||
void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16],
|
||||
crypto_word index) {
|
||||
dev_assert_secret(index >= 0);
|
||||
|
||||
alignas(32) Elem x; limbs_zero(x, P256_LIMBS);
|
||||
alignas(32) Elem y; limbs_zero(y, P256_LIMBS);
|
||||
alignas(32) Elem z; limbs_zero(z, P256_LIMBS);
|
||||
|
||||
// TODO: Rewrite in terms of |limbs_select|.
|
||||
for (size_t i = 0; i < 16; ++i) {
|
||||
crypto_word equal = constant_time_eq_w(index, (crypto_word)i + 1);
|
||||
for (size_t j = 0; j < P256_LIMBS; ++j) {
|
||||
x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
|
||||
y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
|
||||
z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
|
||||
}
|
||||
}
|
||||
|
||||
limbs_copy(out->X, x, P256_LIMBS);
|
||||
limbs_copy(out->Y, y, P256_LIMBS);
|
||||
limbs_copy(out->Z, z, P256_LIMBS);
|
||||
}
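The loop above is the standard branch-free table lookup: every entry of the table is read and combined through masks so that neither the memory access pattern nor the timing depends on the secret index. Below is a minimal, self-contained sketch of the same idiom, assuming 64-bit limbs; the helper names (ct_is_zero_u64, ct_select_u64, ct_table_select) are illustrative only and not part of ring.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* All-ones if x == 0, all-zeros otherwise, computed without branching on x. */
static uint64_t ct_is_zero_u64(uint64_t x) {
  return (uint64_t)0 - (uint64_t)(1 ^ ((x | (0 - x)) >> 63));
}

/* mask must be all-ones or all-zeros; picks a or b without branching. */
static uint64_t ct_select_u64(uint64_t mask, uint64_t a, uint64_t b) {
  return (a & mask) | (b & ~mask);
}

/* Copies table[index - 1] (index in 1..n, width limbs per entry) into out,
 * touching every entry so the access pattern is independent of the index. */
static void ct_table_select(uint64_t *out, const uint64_t *table, size_t n,
                            size_t width, uint64_t index) {
  for (size_t j = 0; j < width; j++) {
    out[j] = 0;
  }
  for (size_t i = 0; i < n; i++) {
    uint64_t equal = ct_is_zero_u64(index ^ (uint64_t)(i + 1));
    for (size_t j = 0; j < width; j++) {
      out[j] = ct_select_u64(equal, table[i * width + j], out[j]);
    }
  }
}

int main(void) {
  const uint64_t table[4][2] = {{10, 11}, {20, 21}, {30, 31}, {40, 41}};
  uint64_t out[2];
  ct_table_select(out, &table[0][0], 4, 2, 3); /* "secret" index = 3 */
  printf("%llu %llu\n", (unsigned long long)out[0], (unsigned long long)out[1]);
  return 0;
}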
|
||||
|
||||
#if defined GFp_USE_LARGE_TABLE
|
||||
void GFp_nistz256_select_w7(P256_POINT_AFFINE *out,
|
||||
const PRECOMP256_ROW table, crypto_word index) {
|
||||
alignas(32) Limb xy[P256_LIMBS * 2];
|
||||
limbs_select(xy, table, P256_LIMBS * 2, 64, index - 1);
|
||||
limbs_copy(out->X, &xy[0], P256_LIMBS);
|
||||
limbs_copy(out->Y, &xy[P256_LIMBS], P256_LIMBS);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
242
zeroidc/vendor/ring/crypto/fipsmodule/ec/gfp_p384.c
vendored
Normal file
@@ -0,0 +1,242 @@
|
||||
/* Copyright 2016 Brian Smith.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
#include "../../limbs/limbs.h"
|
||||
|
||||
#include "ecp_nistz384.h"
|
||||
#include "../bn/internal.h"
|
||||
#include "../../internal.h"
|
||||
|
||||
#include "../../limbs/limbs.inl"
|
||||
|
||||
/* XXX: Here we assume that the conversion from |Carry| to |Limb| is
|
||||
* constant-time, but we haven't verified that assumption. TODO: Fix it so
|
||||
* we don't need to make that assumption. */
|
||||
|
||||
|
||||
typedef Limb Elem[P384_LIMBS];
|
||||
typedef Limb ScalarMont[P384_LIMBS];
|
||||
typedef Limb Scalar[P384_LIMBS];
|
||||
|
||||
|
||||
static const BN_ULONG Q[P384_LIMBS] = {
|
||||
TOBN(0x00000000, 0xffffffff),
|
||||
TOBN(0xffffffff, 0x00000000),
|
||||
TOBN(0xffffffff, 0xfffffffe),
|
||||
TOBN(0xffffffff, 0xffffffff),
|
||||
TOBN(0xffffffff, 0xffffffff),
|
||||
TOBN(0xffffffff, 0xffffffff),
|
||||
};
|
||||
|
||||
static const BN_ULONG N[P384_LIMBS] = {
|
||||
TOBN(0xecec196a, 0xccc52973),
|
||||
TOBN(0x581a0db2, 0x48b0a77a),
|
||||
TOBN(0xc7634d81, 0xf4372ddf),
|
||||
TOBN(0xffffffff, 0xffffffff),
|
||||
TOBN(0xffffffff, 0xffffffff),
|
||||
TOBN(0xffffffff, 0xffffffff),
|
||||
};
|
||||
|
||||
|
||||
static const BN_ULONG ONE[P384_LIMBS] = {
|
||||
TOBN(0xffffffff, 1), TOBN(0, 0xffffffff), TOBN(0, 1), TOBN(0, 0), TOBN(0, 0),
|
||||
TOBN(0, 0),
|
||||
};
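/* For reference: |Q| above is the P-384 field prime
 * q = 2**384 - 2**128 - 2**96 + 2**32 - 1, |N| is the P-384 group order, and
 * |ONE| is 1 in Montgomery form (2**384 mod q); all are stored with the
 * least-significant limb first. */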
|
||||
|
||||
|
||||
/* XXX: MSVC for x86 warns when it fails to inline these functions it should
|
||||
* probably inline. */
|
||||
#if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86)
|
||||
#define INLINE_IF_POSSIBLE __forceinline
|
||||
#else
|
||||
#define INLINE_IF_POSSIBLE inline
|
||||
#endif
|
||||
|
||||
static inline Limb is_equal(const Elem a, const Elem b) {
|
||||
return LIMBS_equal(a, b, P384_LIMBS);
|
||||
}
|
||||
|
||||
static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) {
|
||||
return LIMBS_are_zero(a, P384_LIMBS);
|
||||
}
|
||||
|
||||
static inline void copy_conditional(Elem r, const Elem a,
|
||||
const Limb condition) {
|
||||
for (size_t i = 0; i < P384_LIMBS; ++i) {
|
||||
r[i] = constant_time_select_w(condition, a[i], r[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline void elem_add(Elem r, const Elem a, const Elem b) {
|
||||
LIMBS_add_mod(r, a, b, Q, P384_LIMBS);
|
||||
}
|
||||
|
||||
static inline void elem_sub(Elem r, const Elem a, const Elem b) {
|
||||
LIMBS_sub_mod(r, a, b, Q, P384_LIMBS);
|
||||
}
|
||||
|
||||
static void elem_div_by_2(Elem r, const Elem a) {
|
||||
/* Consider the case where `a` is even. Then we can shift `a` right one bit
|
||||
* and the result will still be valid because we didn't lose any bits and so
|
||||
* `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy.
|
||||
*
|
||||
* The remainder of this comment is considering the case where `a` is odd.
|
||||
*
|
||||
* Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)`
|
||||
* because the lowest bit is lost during the shift. For example, consider:
|
||||
*
|
||||
* ```python
|
||||
* q = 2**384 - 2**128 - 2**96 + 2**32 - 1
|
||||
* a = 2**383
|
||||
* two_a = a * 2 % q
|
||||
* assert two_a == 0x100000000ffffffffffffffff00000001
|
||||
* ```
|
||||
*
|
||||
* Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When
|
||||
* we divide `two_a` by two (mod q), we need to get the value `2**383`, which
|
||||
* we obviously can't get with just a right shift.
|
||||
*
|
||||
* `q` is odd, and `a` is odd, so `a + q` is even. We could calculate
|
||||
* `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to
|
||||
* keep track of an extra most significant bit. We can avoid that by instead
|
||||
* calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least
|
||||
* significant bit of `a`. `q + 1` is even, which means it can be shifted
|
||||
* without losing any bits. Since `q` is odd, `q - 1` is even, so the largest
|
||||
* odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know
|
||||
* `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of
|
||||
* `a >> 1` is `(a - 1)/2` since the shift will drop the least significant
|
||||
* bit of `a`, which is 1. Thus:
|
||||
*
|
||||
* sum = ((q + 1) >> 1) + (a >> 1)
|
||||
* sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2)
|
||||
* <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2)
|
||||
* <= (q + 1)/2 + (q - 3)/2 (simplifying)
|
||||
* <= (q + 1 + q - 3)/2 (factoring out the common divisor)
|
||||
* <= (2q - 2)/2 (simplifying)
|
||||
* <= q - 1 (simplifying)
|
||||
*
|
||||
* Thus, no reduction of the sum mod `q` is necessary. */
|
||||
|
||||
Limb is_odd = constant_time_is_nonzero_w(a[0] & 1);
|
||||
|
||||
/* r = a >> 1. */
|
||||
Limb carry = a[P384_LIMBS - 1] & 1;
|
||||
r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1;
|
||||
for (size_t i = 1; i < P384_LIMBS; ++i) {
|
||||
Limb new_carry = a[P384_LIMBS - i - 1];
|
||||
r[P384_LIMBS - i - 1] =
|
||||
(a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1));
|
||||
carry = new_carry;
|
||||
}
|
||||
|
||||
static const Elem Q_PLUS_1_SHR_1 = {
|
||||
TOBN(0x00000000, 0x80000000), TOBN(0x7fffffff, 0x80000000),
|
||||
TOBN(0xffffffff, 0xffffffff), TOBN(0xffffffff, 0xffffffff),
|
||||
TOBN(0xffffffff, 0xffffffff), TOBN(0x7fffffff, 0xffffffff),
|
||||
};
|
||||
|
||||
Elem adjusted;
|
||||
BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS);
|
||||
dev_assert_secret(carry2 == 0);
|
||||
(void)carry2;
|
||||
copy_conditional(r, adjusted, is_odd);
|
||||
}
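The bound derived in the comment above is easy to sanity-check exhaustively with a toy modulus. A hedged, self-contained sketch follows; the small odd prime q merely stands in for the P-384 field order, and the program is illustrative, not part of ring.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy-sized check of the halving identity used above: for an odd modulus q
 * and any a < q, a/2 (mod q) is (a >> 1) + ((q + 1) >> 1) when a is odd, and
 * simply a >> 1 when a is even, with no further reduction needed. */
int main(void) {
  const uint64_t q = 1000003; /* small odd prime, stand-in for the field order */
  for (uint64_t a = 0; a < q; ++a) {
    uint64_t half = a >> 1;
    if (a & 1) {
      half += (q + 1) >> 1; /* no reduction needed; see the bound derived above */
    }
    assert(half < q);
    assert((2 * half) % q == a);
  }
  printf("halving identity verified for q = %llu\n", (unsigned long long)q);
  return 0;
}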
|
||||
|
||||
static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) {
|
||||
static const BN_ULONG Q_N0[] = {
|
||||
BN_MONT_CTX_N0(0x1, 0x1)
|
||||
};
|
||||
/* XXX: Not (clearly) constant-time; inefficient. */
|
||||
GFp_bn_mul_mont(r, a, b, Q, Q_N0, P384_LIMBS);
|
||||
}
|
||||
|
||||
static inline void elem_mul_by_2(Elem r, const Elem a) {
|
||||
LIMBS_shl_mod(r, a, Q, P384_LIMBS);
|
||||
}
|
||||
|
||||
static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) {
|
||||
/* XXX: inefficient. TODO: Replace with an integrated shift + add. */
|
||||
Elem doubled;
|
||||
elem_add(doubled, a, a);
|
||||
elem_add(r, doubled, a);
|
||||
}
|
||||
|
||||
static inline void elem_sqr_mont(Elem r, const Elem a) {
|
||||
/* XXX: Inefficient. TODO: Add a dedicated squaring routine. */
|
||||
elem_mul_mont(r, a, a);
|
||||
}
|
||||
|
||||
void GFp_p384_elem_add(Elem r, const Elem a, const Elem b) {
|
||||
elem_add(r, a, b);
|
||||
}
|
||||
|
||||
void GFp_p384_elem_sub(Elem r, const Elem a, const Elem b) {
|
||||
elem_sub(r, a, b);
|
||||
}
|
||||
|
||||
void GFp_p384_elem_div_by_2(Elem r, const Elem a) {
|
||||
elem_div_by_2(r, a);
|
||||
}
|
||||
|
||||
void GFp_p384_elem_mul_mont(Elem r, const Elem a, const Elem b) {
|
||||
elem_mul_mont(r, a, b);
|
||||
}
|
||||
|
||||
void GFp_p384_elem_neg(Elem r, const Elem a) {
|
||||
Limb is_zero = LIMBS_are_zero(a, P384_LIMBS);
|
||||
Carry borrow = limbs_sub(r, Q, a, P384_LIMBS);
|
||||
dev_assert_secret(borrow == 0);
|
||||
(void)borrow;
|
||||
for (size_t i = 0; i < P384_LIMBS; ++i) {
|
||||
r[i] = constant_time_select_w(is_zero, 0, r[i]);
|
||||
}
|
||||
}
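/* The negation above is computed as q - a and then forced to zero when a was
 * itself zero, since otherwise -0 would come out as q, which is not a reduced
 * element. Both steps are branch-free, so timing does not depend on the
 * secret value. */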
|
||||
|
||||
|
||||
void GFp_p384_scalar_mul_mont(ScalarMont r, const ScalarMont a,
|
||||
const ScalarMont b) {
|
||||
static const BN_ULONG N_N0[] = {
|
||||
BN_MONT_CTX_N0(0x6ed46089, 0xe88fdc45)
|
||||
};
|
||||
/* XXX: Inefficient. TODO: Add dedicated multiplication routine. */
|
||||
GFp_bn_mul_mont(r, a, b, N, N_N0, P384_LIMBS);
|
||||
}
|
||||
|
||||
|
||||
/* TODO(perf): Optimize this. */
|
||||
|
||||
static void gfp_p384_point_select_w5(P384_POINT *out,
|
||||
const P384_POINT table[16], size_t index) {
|
||||
Elem x; limbs_zero(x, P384_LIMBS);
|
||||
Elem y; limbs_zero(y, P384_LIMBS);
|
||||
Elem z; limbs_zero(z, P384_LIMBS);
|
||||
|
||||
// TODO: Rewrite in terms of |limbs_select|.
|
||||
for (size_t i = 0; i < 16; ++i) {
|
||||
crypto_word equal = constant_time_eq_w(index, (crypto_word)i + 1);
|
||||
for (size_t j = 0; j < P384_LIMBS; ++j) {
|
||||
x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
|
||||
y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
|
||||
z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
|
||||
}
|
||||
}
|
||||
|
||||
limbs_copy(out->X, x, P384_LIMBS);
|
||||
limbs_copy(out->Y, y, P384_LIMBS);
|
||||
limbs_copy(out->Z, z, P384_LIMBS);
|
||||
}
|
||||
|
||||
|
||||
#include "ecp_nistz384.inl"
|
||||
1362
zeroidc/vendor/ring/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt
vendored
Normal file
File diff suppressed because it is too large
1122
zeroidc/vendor/ring/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
vendored
Normal file
File diff suppressed because it is too large
300
zeroidc/vendor/ring/crypto/fipsmodule/modes/asm/ghash-armv4.pl
vendored
Normal file
@@ -0,0 +1,300 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# April 2010
|
||||
#
|
||||
# The module implements "4-bit" GCM GHASH function and underlying
|
||||
# single multiplication operation in GF(2^128). "4-bit" means that it
|
||||
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
|
||||
# experimental performance data available yet. The only approximation
|
||||
# that can be made at this point is based on code size. Inner loop is
|
||||
# 32 instructions long and on single-issue core should execute in <40
|
||||
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
|
||||
# loop, this assembler loop body was found to be ~3x smaller than
|
||||
# compiler-generated one...
|
||||
#
|
||||
# July 2010
|
||||
#
|
||||
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
|
||||
# Cortex A8 core and ~25 cycles per processed byte (which was observed
|
||||
# to be ~3 times faster than gcc-generated code:-)
|
||||
#
|
||||
# February 2011
|
||||
#
|
||||
# Profiler-assisted and platform-specific optimization resulted in 7%
|
||||
# improvement on Cortex A8 core and ~23.5 cycles per byte.
|
||||
#
|
||||
# March 2011
|
||||
#
|
||||
# Add NEON implementation featuring polynomial multiplication, i.e. no
|
||||
# lookup tables involved. On Cortex A8 it was measured to process one
|
||||
# byte in 15 cycles or 55% faster than integer-only code.
|
||||
#
|
||||
# April 2014
|
||||
#
|
||||
# Switch to multiplication algorithm suggested in paper referred
|
||||
# below and combine it with reduction algorithm from x86 module.
|
||||
# Performance improvement over previous version varies from 65% on
|
||||
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
|
||||
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
|
||||
# Snapdragon S4 - in 9.33.
|
||||
#
|
||||
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
||||
# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
||||
#
|
||||
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
||||
|
||||
# ====================================================================
|
||||
# Note about "528B" variant. In ARM case it makes lesser sense to
|
||||
# implement it for following reasons:
|
||||
#
|
||||
# - performance improvement won't be anywhere near 50%, because 128-
|
||||
# bit shift operation is neatly fused with 128-bit xor here, and
|
||||
# "538B" variant would eliminate only 4-5 instructions out of 32
|
||||
# in the inner loop (meaning that estimated improvement is ~15%);
|
||||
# - ARM-based systems are often embedded ones and extra memory
|
||||
# consumption might be unappreciated (for so little improvement);
|
||||
#
|
||||
# Byte order [in]dependence. =========================================
|
||||
#
|
||||
# Caller is expected to maintain specific *dword* order in Htable,
|
||||
# namely with *least* significant dword of 128-bit value at *lower*
|
||||
# address. This differs completely from C code and has everything to
|
||||
# do with ldm instruction and order in which dwords are "consumed" by
|
||||
# algorithm. *Byte* order within these dwords in turn is whatever
|
||||
# *native* byte order on current platform. See gcm128.c for working
|
||||
# example...
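# (Concretely: for a 128-bit value H = Hhi:Hlo, Hlo is expected at Htable+0
#  and Hhi at Htable+8, each dword in the platform's native byte order.)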
|
||||
|
||||
# This file was patched in BoringSSL to remove the variable-time 4-bit
|
||||
# implementation.
|
||||
|
||||
$flavour = shift;
|
||||
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
} else {
|
||||
open OUT,">$output";
|
||||
*STDOUT=*OUT;
|
||||
}
|
||||
|
||||
$Xi="r0"; # argument block
|
||||
$Htbl="r1";
|
||||
$inp="r2";
|
||||
$len="r3";
|
||||
|
||||
$code=<<___;
|
||||
#include <GFp/arm_arch.h>
|
||||
|
||||
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
|
||||
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
|
||||
@ instructions are in aesv8-armx.pl.)
|
||||
.arch armv7-a
|
||||
|
||||
.text
|
||||
#if defined(__thumb2__) || defined(__clang__)
|
||||
.syntax unified
|
||||
#define ldrplb ldrbpl
|
||||
#define ldrneb ldrbne
|
||||
#endif
|
||||
#if defined(__thumb2__)
|
||||
.thumb
|
||||
#else
|
||||
.code 32
|
||||
#endif
|
||||
___
|
||||
{
|
||||
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
|
||||
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
|
||||
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
|
||||
|
||||
sub clmul64x64 {
|
||||
my ($r,$a,$b)=@_;
|
||||
$code.=<<___;
|
||||
vext.8 $t0#lo, $a, $a, #1 @ A1
|
||||
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
|
||||
vext.8 $r#lo, $b, $b, #1 @ B1
|
||||
vmull.p8 $r, $a, $r#lo @ E = A*B1
|
||||
vext.8 $t1#lo, $a, $a, #2 @ A2
|
||||
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
|
||||
vext.8 $t3#lo, $b, $b, #2 @ B2
|
||||
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
|
||||
vext.8 $t2#lo, $a, $a, #3 @ A3
|
||||
veor $t0, $t0, $r @ L = E + F
|
||||
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
|
||||
vext.8 $r#lo, $b, $b, #3 @ B3
|
||||
veor $t1, $t1, $t3 @ M = G + H
|
||||
vmull.p8 $r, $a, $r#lo @ I = A*B3
|
||||
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
|
||||
vand $t0#hi, $t0#hi, $k48
|
||||
vext.8 $t3#lo, $b, $b, #4 @ B4
|
||||
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
|
||||
vand $t1#hi, $t1#hi, $k32
|
||||
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
|
||||
veor $t2, $t2, $r @ N = I + J
|
||||
veor $t0#lo, $t0#lo, $t0#hi
|
||||
veor $t1#lo, $t1#lo, $t1#hi
|
||||
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
|
||||
vand $t2#hi, $t2#hi, $k16
|
||||
vext.8 $t0, $t0, $t0, #15
|
||||
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
|
||||
vmov.i64 $t3#hi, #0
|
||||
vext.8 $t1, $t1, $t1, #14
|
||||
veor $t2#lo, $t2#lo, $t2#hi
|
||||
vmull.p8 $r, $a, $b @ D = A*B
|
||||
vext.8 $t3, $t3, $t3, #12
|
||||
vext.8 $t2, $t2, $t2, #13
|
||||
veor $t0, $t0, $t1
|
||||
veor $t2, $t2, $t3
|
||||
veor $r, $r, $t0
|
||||
veor $r, $r, $t2
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
.global GFp_gcm_init_neon
|
||||
.type GFp_gcm_init_neon,%function
|
||||
.align 4
|
||||
GFp_gcm_init_neon:
|
||||
vld1.64 $IN#hi,[r1]! @ load H
|
||||
vmov.i8 $t0,#0xe1
|
||||
vld1.64 $IN#lo,[r1]
|
||||
vshl.i64 $t0#hi,#57
|
||||
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
|
||||
vdup.8 $t1,$IN#hi[7]
|
||||
vshr.u64 $Hlo,$IN#lo,#63
|
||||
vshr.s8 $t1,#7 @ broadcast carry bit
|
||||
vshl.i64 $IN,$IN,#1
|
||||
vand $t0,$t0,$t1
|
||||
vorr $IN#hi,$Hlo @ H<<<=1
|
||||
veor $IN,$IN,$t0 @ twisted H
|
||||
vstmia r0,{$IN}
|
||||
|
||||
ret @ bx lr
|
||||
.size GFp_gcm_init_neon,.-GFp_gcm_init_neon
|
||||
|
||||
.global GFp_gcm_gmult_neon
|
||||
.type GFp_gcm_gmult_neon,%function
|
||||
.align 4
|
||||
GFp_gcm_gmult_neon:
|
||||
vld1.64 $IN#hi,[$Xi]! @ load Xi
|
||||
vld1.64 $IN#lo,[$Xi]!
|
||||
vmov.i64 $k48,#0x0000ffffffffffff
|
||||
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
|
||||
vmov.i64 $k32,#0x00000000ffffffff
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $IN,$IN
|
||||
#endif
|
||||
vmov.i64 $k16,#0x000000000000ffff
|
||||
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
|
||||
mov $len,#16
|
||||
b .Lgmult_neon
|
||||
.size GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon
|
||||
|
||||
.global GFp_gcm_ghash_neon
|
||||
.type GFp_gcm_ghash_neon,%function
|
||||
.align 4
|
||||
GFp_gcm_ghash_neon:
|
||||
vld1.64 $Xl#hi,[$Xi]! @ load Xi
|
||||
vld1.64 $Xl#lo,[$Xi]!
|
||||
vmov.i64 $k48,#0x0000ffffffffffff
|
||||
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
|
||||
vmov.i64 $k32,#0x00000000ffffffff
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vmov.i64 $k16,#0x000000000000ffff
|
||||
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
|
||||
|
||||
.Loop_neon:
|
||||
vld1.64 $IN#hi,[$inp]! @ load inp
|
||||
vld1.64 $IN#lo,[$inp]!
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $IN,$IN
|
||||
#endif
|
||||
veor $IN,$Xl @ inp^=Xi
|
||||
.Lgmult_neon:
|
||||
___
|
||||
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
|
||||
$code.=<<___;
|
||||
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
|
||||
___
|
||||
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
|
||||
$code.=<<___;
|
||||
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
|
||||
veor $Xm,$Xm,$Xh
|
||||
veor $Xl#hi,$Xl#hi,$Xm#lo
|
||||
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
|
||||
|
||||
@ equivalent of reduction_avx from ghash-x86_64.pl
|
||||
vshl.i64 $t1,$Xl,#57 @ 1st phase
|
||||
vshl.i64 $t2,$Xl,#62
|
||||
veor $t2,$t2,$t1 @
|
||||
vshl.i64 $t1,$Xl,#63
|
||||
veor $t2, $t2, $t1 @
|
||||
veor $Xl#hi,$Xl#hi,$t2#lo @
|
||||
veor $Xh#lo,$Xh#lo,$t2#hi
|
||||
|
||||
vshr.u64 $t2,$Xl,#1 @ 2nd phase
|
||||
veor $Xh,$Xh,$Xl
|
||||
veor $Xl,$Xl,$t2 @
|
||||
vshr.u64 $t2,$t2,#6
|
||||
vshr.u64 $Xl,$Xl,#1 @
|
||||
veor $Xl,$Xl,$Xh @
|
||||
veor $Xl,$Xl,$t2 @
|
||||
|
||||
subs $len,#16
|
||||
bne .Loop_neon
|
||||
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
sub $Xi,#16
|
||||
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
|
||||
vst1.64 $Xl#lo,[$Xi]
|
||||
|
||||
ret @ bx lr
|
||||
.size GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon
|
||||
#endif
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/\bret\b/bx lr/go or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT or die "error closing STDOUT"; # enforce flush
|
||||
714
zeroidc/vendor/ring/crypto/fipsmodule/modes/asm/ghash-x86.pl
vendored
Normal file
@@ -0,0 +1,714 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# March, May, June 2010
|
||||
#
|
||||
# The module implements "4-bit" GCM GHASH function and underlying
|
||||
# single multiplication operation in GF(2^128). "4-bit" means that it
|
||||
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
|
||||
# code paths: vanilla x86 and vanilla SSE. Former will be executed on
|
||||
# 486 and Pentium, latter on all others. SSE GHASH features so called
|
||||
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
|
||||
# of per-key storage [+512 bytes shared table]. Performance results
|
||||
# are for streamed GHASH subroutine and are expressed in cycles per
|
||||
# processed byte, less is better:
|
||||
#
|
||||
# gcc 2.95.3(*) SSE assembler x86 assembler
|
||||
#
|
||||
# Pentium 105/111(**) - 50
|
||||
# PIII 68 /75 12.2 24
|
||||
# P4 125/125 17.8 84(***)
|
||||
# Opteron 66 /70 10.1 30
|
||||
# Core2 54 /67 8.4 18
|
||||
# Atom 105/105 16.8 53
|
||||
# VIA Nano 69 /71 13.0 27
|
||||
#
|
||||
# (*)	gcc 3.4.x was observed to generate a few percent slower code,
|
||||
# which is one of reasons why 2.95.3 results were chosen,
|
||||
# another reason is lack of 3.4.x results for older CPUs;
|
||||
# comparison with SSE results is not completely fair, because C
|
||||
# results are for vanilla "256B" implementation, while
|
||||
# assembler results are for "528B";-)
|
||||
# (**) second number is result for code compiled with -fPIC flag,
|
||||
# which is actually more relevant, because assembler code is
|
||||
# position-independent;
|
||||
# (***) see comment in non-MMX routine for further details;
|
||||
#
|
||||
# To summarize, it's >2-5 times faster than gcc-generated code. To
|
||||
# anchor it to something else SHA1 assembler processes one byte in
|
||||
# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
|
||||
# in particular, see comment at the end of the file...
|
||||
|
||||
# May 2010
|
||||
#
|
||||
# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
|
||||
# The question is how close is it to theoretical limit? The pclmulqdq
|
||||
# instruction latency appears to be 14 cycles and there can't be more
|
||||
# than 2 of them executing at any given time. This means that single
|
||||
# Karatsuba multiplication would take 28 cycles *plus* few cycles for
|
||||
# pre- and post-processing. Then multiplication has to be followed by
|
||||
# modulo-reduction. Given that aggregated reduction method [see
|
||||
# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
|
||||
# white paper by Intel] allows you to perform reduction only once in
|
||||
# a while we can assume that asymptotic performance can be estimated
|
||||
# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
|
||||
# and Naggr is the aggregation factor.
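# For concreteness: plugging in the Tmod ~19 cycles and Naggr=4 attributed to
# Intel's code below gives (28+19/4)/16 ~= 2.05 cycles per processed byte,
# while this module's Tmod ~13 and Naggr=2 give (28+13/2)/16 ~= 2.16.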
|
||||
#
|
||||
# Before we proceed to this implementation let's have closer look at
|
||||
# the best-performing code suggested by Intel in their white paper.
|
||||
# By tracing inter-register dependencies Tmod is estimated as ~19
|
||||
# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
|
||||
# processed byte. As implied, this is quite optimistic estimate,
|
||||
# because it does not account for Karatsuba pre- and post-processing,
|
||||
# which for a single multiplication is ~5 cycles. Unfortunately Intel
|
||||
# does not provide performance data for GHASH alone. But benchmarking
|
||||
# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
|
||||
# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
|
||||
# the result accounts even for pre-computing of degrees of the hash
|
||||
# key H, but its portion is negligible at 16KB buffer size.
|
||||
#
|
||||
# Moving on to the implementation in question. Tmod is estimated as
|
||||
# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
|
||||
# 2.16. How is it possible that measured performance is better than
|
||||
# optimistic theoretical estimate? There is one thing Intel failed
|
||||
# to recognize. By serializing GHASH with CTR in same subroutine
|
||||
# former's performance is really limited to above (Tmul + Tmod/Naggr)
|
||||
# equation. But if GHASH procedure is detached, the modulo-reduction
|
||||
# can be interleaved with Naggr-1 multiplications at instruction level
|
||||
# and under ideal conditions even disappear from the equation. So that
|
||||
# optimistic theoretical estimate for this implementation is ...
|
||||
# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
|
||||
# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
|
||||
# where Tproc is time required for Karatsuba pre- and post-processing,
|
||||
# is more realistic estimate. In this case it gives ... 1.91 cycles.
|
||||
# Or in other words, depending on how well we can interleave reduction
|
||||
# and one of the two multiplications the performance should be between
|
||||
# 1.91 and 2.16. As already mentioned, this implementation processes
|
||||
# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
|
||||
# - in 2.02. x86_64 performance is better, because larger register
|
||||
# bank allows to interleave reduction and multiplication better.
|
||||
#
|
||||
# Does it make sense to increase Naggr? To start with it's virtually
|
||||
# impossible in 32-bit mode, because of limited register bank
|
||||
# capacity. Otherwise improvement has to be weighed against slower
|
||||
# setup, as well as code size and complexity increase. As even
|
||||
# optimistic estimate doesn't promise 30% performance improvement,
|
||||
# there are currently no plans to increase Naggr.
|
||||
#
|
||||
# Special thanks to David Woodhouse for providing access to a
|
||||
# Westmere-based system on behalf of Intel Open Source Technology Centre.
|
||||
|
||||
# January 2010
|
||||
#
|
||||
# Tweaked to optimize transitions between integer and FP operations
|
||||
# on same XMM register, PCLMULQDQ subroutine was measured to process
|
||||
# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
|
||||
# The minor regression on Westmere is outweighed by ~15% improvement
|
||||
# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
|
||||
# similar manner resulted in almost 20% degradation on Sandy Bridge,
|
||||
# where original 64-bit code processes one byte in 1.95 cycles.
|
||||
|
||||
#####################################################################
|
||||
# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
|
||||
# 32-bit mode and 1.89 in 64-bit.
|
||||
|
||||
# February 2013
|
||||
#
|
||||
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
|
||||
# reduction_alg9. Resulting performance is 1.96 cycles per byte on
|
||||
# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
$output=pop;
|
||||
open STDOUT,">$output";
|
||||
|
||||
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
|
||||
|
||||
$sse2=1;
|
||||
|
||||
|
||||
if ($sse2) {{
|
||||
######################################################################
|
||||
# PCLMULQDQ version.
|
||||
|
||||
$Xip="eax";
|
||||
$Htbl="edx";
|
||||
$const="ecx";
|
||||
$inp="esi";
|
||||
$len="ebx";
|
||||
|
||||
($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
|
||||
($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
|
||||
($Xn,$Xhn)=("xmm6","xmm7");
|
||||
|
||||
&static_label("bswap");
|
||||
|
||||
sub clmul64x64_T2 { # minimal "register" pressure
|
||||
my ($Xhi,$Xi,$Hkey,$HK)=@_;
|
||||
|
||||
&movdqa ($Xhi,$Xi); #
|
||||
&pshufd ($T1,$Xi,0b01001110);
|
||||
&pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
|
||||
&pxor ($T1,$Xi); #
|
||||
&pxor ($T2,$Hkey) if (!defined($HK));
|
||||
$HK=$T2 if (!defined($HK));
|
||||
|
||||
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
||||
&pclmulqdq ($T1,$HK,0x00); #######
|
||||
&xorps ($T1,$Xi); #
|
||||
&xorps ($T1,$Xhi); #
|
||||
|
||||
&movdqa ($T2,$T1); #
|
||||
&psrldq ($T1,8);
|
||||
&pslldq ($T2,8); #
|
||||
&pxor ($Xhi,$T1);
|
||||
&pxor ($Xi,$T2); #
|
||||
}
|
||||
|
||||
sub clmul64x64_T3 {
|
||||
# Even though this subroutine offers visually better ILP, it
|
||||
# was empirically found to be a tad slower than above version.
|
||||
# At least in GFp_gcm_ghash_clmul context. But it's just as well,
|
||||
# because loop modulo-scheduling is possible only thanks to
|
||||
# minimized "register" pressure...
|
||||
my ($Xhi,$Xi,$Hkey)=@_;
|
||||
|
||||
&movdqa ($T1,$Xi); #
|
||||
&movdqa ($Xhi,$Xi);
|
||||
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
||||
&pshufd ($T2,$T1,0b01001110); #
|
||||
&pshufd ($T3,$Hkey,0b01001110);
|
||||
&pxor ($T2,$T1); #
|
||||
&pxor ($T3,$Hkey);
|
||||
&pclmulqdq ($T2,$T3,0x00); #######
|
||||
&pxor ($T2,$Xi); #
|
||||
&pxor ($T2,$Xhi); #
|
||||
|
||||
&movdqa ($T3,$T2); #
|
||||
&psrldq ($T2,8);
|
||||
&pslldq ($T3,8); #
|
||||
&pxor ($Xhi,$T2);
|
||||
&pxor ($Xi,$T3); #
|
||||
}
|
||||
|
||||
if (1) { # Algorithm 9 with <<1 twist.
|
||||
# Reduction is shorter and uses only two
|
||||
# temporary registers, which makes it better
|
||||
# candidate for interleaving with 64x64
|
||||
# multiplication. Pre-modulo-scheduled loop
|
||||
# was found to be ~20% faster than Algorithm 5
|
||||
# below. Algorithm 9 was therefore chosen for
|
||||
# further optimization...
|
||||
|
||||
sub reduction_alg9 { # 17/11 times faster than Intel version
|
||||
my ($Xhi,$Xi) = @_;
|
||||
|
||||
# 1st phase
|
||||
&movdqa ($T2,$Xi); #
|
||||
&movdqa ($T1,$Xi);
|
||||
&psllq ($Xi,5);
|
||||
&pxor ($T1,$Xi); #
|
||||
&psllq ($Xi,1);
|
||||
&pxor ($Xi,$T1); #
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T1,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
&psrldq ($T1,8); #
|
||||
&pxor ($Xi,$T2);
|
||||
&pxor ($Xhi,$T1); #
|
||||
|
||||
# 2nd phase
|
||||
&movdqa ($T2,$Xi);
|
||||
&psrlq ($Xi,1);
|
||||
&pxor ($Xhi,$T2); #
|
||||
&pxor ($T2,$Xi);
|
||||
&psrlq ($Xi,5);
|
||||
&pxor ($Xi,$T2); #
|
||||
&psrlq ($Xi,1); #
|
||||
&pxor ($Xi,$Xhi) #
|
||||
}
|
||||
|
||||
&function_begin_B("GFp_gcm_init_clmul");
|
||||
&mov ($Htbl,&wparam(0));
|
||||
&mov ($Xip,&wparam(1));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Hkey,&QWP(0,$Xip));
|
||||
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
|
||||
|
||||
# <<1 twist
|
||||
&pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
|
||||
&movdqa ($T1,$Hkey);
|
||||
&psllq ($Hkey,1);
|
||||
&pxor ($T3,$T3); #
|
||||
&psrlq ($T1,63);
|
||||
&pcmpgtd ($T3,$T2); # broadcast carry bit
|
||||
&pslldq ($T1,8);
|
||||
&por ($Hkey,$T1); # H<<=1
|
||||
|
||||
# magic reduction
|
||||
&pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
|
||||
&pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
|
||||
|
||||
# calculate H^2
|
||||
&movdqa ($Xi,$Hkey);
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
|
||||
&pshufd ($T1,$Hkey,0b01001110);
|
||||
&pshufd ($T2,$Xi,0b01001110);
|
||||
&pxor ($T1,$Hkey); # Karatsuba pre-processing
|
||||
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
|
||||
&pxor ($T2,$Xi); # Karatsuba pre-processing
|
||||
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
|
||||
&palignr ($T2,$T1,8); # low part is H.lo^H.hi
|
||||
&movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
|
||||
|
||||
&ret ();
|
||||
&function_end_B("GFp_gcm_init_clmul");
|
||||
|
||||
&function_begin_B("GFp_gcm_gmult_clmul");
|
||||
&mov ($Xip,&wparam(0));
|
||||
&mov ($Htbl,&wparam(1));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Xi,&QWP(0,$Xip));
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movups ($Hkey,&QWP(0,$Htbl));
|
||||
&pshufb ($Xi,$T3);
|
||||
&movups ($T2,&QWP(32,$Htbl));
|
||||
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
|
||||
&pshufb ($Xi,$T3);
|
||||
&movdqu (&QWP(0,$Xip),$Xi);
|
||||
|
||||
&ret ();
|
||||
&function_end_B("GFp_gcm_gmult_clmul");
|
||||
|
||||
&function_begin("GFp_gcm_ghash_clmul");
|
||||
&mov ($Xip,&wparam(0));
|
||||
&mov ($Htbl,&wparam(1));
|
||||
&mov ($inp,&wparam(2));
|
||||
&mov ($len,&wparam(3));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Xi,&QWP(0,$Xip));
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl));
|
||||
&pshufb ($Xi,$T3);
|
||||
|
||||
&sub ($len,0x10);
|
||||
&jz (&label("odd_tail"));
|
||||
|
||||
#######
|
||||
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
||||
# [(H*Ii+1) + (H*Xi+1)] mod P =
|
||||
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
||||
#
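# (the last step uses Xi+1 = [H*(Ii + Xi)] mod P from the previous round,
#  which is what lets the two block multiplications share a single reduction)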
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pshufb ($T1,$T3);
|
||||
&pshufb ($Xn,$T3);
|
||||
&movdqu ($T3,&QWP(32,$Htbl));
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
|
||||
&movdqa ($Xhn,$Xn);
|
||||
&pxor ($T1,$Xn); #
|
||||
&lea ($inp,&DWP(32,$inp)); # i+=2
|
||||
|
||||
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
||||
&pclmulqdq ($T1,$T3,0x00); #######
|
||||
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
&nop ();
|
||||
|
||||
&sub ($len,0x20);
|
||||
&jbe (&label("even_tail"));
|
||||
&jmp (&label("mod_loop"));
|
||||
|
||||
&set_label("mod_loop",32);
|
||||
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
|
||||
&movdqa ($Xhi,$Xi);
|
||||
&pxor ($T2,$Xi); #
|
||||
&nop ();
|
||||
|
||||
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
||||
&pclmulqdq ($T2,$T3,0x10); #######
|
||||
&movups ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
|
||||
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&xorps ($Xhi,$Xhn);
|
||||
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
|
||||
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pxor ($T1,$Xhi); #
|
||||
|
||||
&pshufb ($Xhn,$T3);
|
||||
&pxor ($T2,$T1); #
|
||||
|
||||
&movdqa ($T1,$T2); #
|
||||
&psrldq ($T2,8);
|
||||
&pslldq ($T1,8); #
|
||||
&pxor ($Xhi,$T2);
|
||||
&pxor ($Xi,$T1); #
|
||||
&pshufb ($Xn,$T3);
|
||||
&pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
|
||||
|
||||
&movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
|
||||
&movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
|
||||
&movdqa ($T1,$Xi);
|
||||
&psllq ($Xi,5);
|
||||
&pxor ($T1,$Xi); #
|
||||
&psllq ($Xi,1);
|
||||
&pxor ($Xi,$T1); #
|
||||
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
||||
&movups ($T3,&QWP(32,$Htbl));
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T1,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
&psrldq ($T1,8); #
|
||||
&pxor ($Xi,$T2);
|
||||
&pxor ($Xhi,$T1); #
|
||||
&pshufd ($T1,$Xhn,0b01001110);
|
||||
&movdqa ($T2,$Xi); # 2nd phase
|
||||
&psrlq ($Xi,1);
|
||||
&pxor ($T1,$Xhn);
|
||||
&pxor ($Xhi,$T2); #
|
||||
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
||||
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
&pxor ($T2,$Xi);
|
||||
&psrlq ($Xi,5);
|
||||
&pxor ($Xi,$T2); #
|
||||
&psrlq ($Xi,1); #
|
||||
&pxor ($Xi,$Xhi) #
|
||||
&pclmulqdq ($T1,$T3,0x00); #######
|
||||
|
||||
&lea ($inp,&DWP(32,$inp));
|
||||
&sub ($len,0x20);
|
||||
&ja (&label("mod_loop"));
|
||||
|
||||
&set_label("even_tail");
|
||||
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
|
||||
&movdqa ($Xhi,$Xi);
|
||||
&pxor ($T2,$Xi); #
|
||||
|
||||
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
||||
&pclmulqdq ($T2,$T3,0x10); #######
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
|
||||
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&xorps ($Xhi,$Xhn);
|
||||
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
|
||||
&pxor ($T1,$Xhi); #
|
||||
|
||||
&pxor ($T2,$T1); #
|
||||
|
||||
&movdqa ($T1,$T2); #
|
||||
&psrldq ($T2,8);
|
||||
&pslldq ($T1,8); #
|
||||
&pxor ($Xhi,$T2);
|
||||
&pxor ($Xi,$T1); #
|
||||
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
|
||||
&test ($len,$len);
|
||||
&jnz (&label("done"));
|
||||
|
||||
&movups ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
&set_label("odd_tail");
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&pshufb ($T1,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
|
||||
&set_label("done");
|
||||
&pshufb ($Xi,$T3);
|
||||
&movdqu (&QWP(0,$Xip),$Xi);
|
||||
&function_end("GFp_gcm_ghash_clmul");
|
||||
|
||||
} else { # Algorithm 5. Kept for reference purposes.
|
||||
|
||||
sub reduction_alg5 { # 19/16 times faster than Intel version
|
||||
my ($Xhi,$Xi)=@_;
|
||||
|
||||
# <<1
|
||||
&movdqa ($T1,$Xi); #
|
||||
&movdqa ($T2,$Xhi);
|
||||
&pslld ($Xi,1);
|
||||
&pslld ($Xhi,1); #
|
||||
&psrld ($T1,31);
|
||||
&psrld ($T2,31); #
|
||||
&movdqa ($T3,$T1);
|
||||
&pslldq ($T1,4);
|
||||
&psrldq ($T3,12); #
|
||||
&pslldq ($T2,4);
|
||||
&por ($Xhi,$T3); #
|
||||
&por ($Xi,$T1);
|
||||
&por ($Xhi,$T2); #
|
||||
|
||||
# 1st phase
|
||||
&movdqa ($T1,$Xi);
|
||||
&movdqa ($T2,$Xi);
|
||||
&movdqa ($T3,$Xi); #
|
||||
&pslld ($T1,31);
|
||||
&pslld ($T2,30);
|
||||
&pslld ($Xi,25); #
|
||||
&pxor ($T1,$T2);
|
||||
&pxor ($T1,$Xi); #
|
||||
&movdqa ($T2,$T1); #
|
||||
&pslldq ($T1,12);
|
||||
&psrldq ($T2,4); #
|
||||
&pxor ($T3,$T1);
|
||||
|
||||
# 2nd phase
|
||||
&pxor ($Xhi,$T3); #
|
||||
&movdqa ($Xi,$T3);
|
||||
&movdqa ($T1,$T3);
|
||||
&psrld ($Xi,1); #
|
||||
&psrld ($T1,2);
|
||||
&psrld ($T3,7); #
|
||||
&pxor ($Xi,$T1);
|
||||
&pxor ($Xhi,$T2);
|
||||
&pxor ($Xi,$T3); #
|
||||
&pxor ($Xi,$Xhi); #
|
||||
}
|
||||
|
||||
&function_begin_B("GFp_gcm_init_clmul");
|
||||
&mov ($Htbl,&wparam(0));
|
||||
&mov ($Xip,&wparam(1));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Hkey,&QWP(0,$Xip));
|
||||
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
|
||||
|
||||
# calculate H^2
|
||||
&movdqa ($Xi,$Hkey);
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
|
||||
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
|
||||
|
||||
&ret ();
|
||||
&function_end_B("GFp_gcm_init_clmul");
|
||||
|
||||
&function_begin_B("GFp_gcm_gmult_clmul");
|
||||
&mov ($Xip,&wparam(0));
|
||||
&mov ($Htbl,&wparam(1));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Xi,&QWP(0,$Xip));
|
||||
&movdqa ($Xn,&QWP(0,$const));
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl));
|
||||
&pshufb ($Xi,$Xn);
|
||||
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
&pshufb ($Xi,$Xn);
|
||||
&movdqu (&QWP(0,$Xip),$Xi);
|
||||
|
||||
&ret ();
|
||||
&function_end_B("GFp_gcm_gmult_clmul");
|
||||
|
||||
&function_begin("GFp_gcm_ghash_clmul");
|
||||
&mov ($Xip,&wparam(0));
|
||||
&mov ($Htbl,&wparam(1));
|
||||
&mov ($inp,&wparam(2));
|
||||
&mov ($len,&wparam(3));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Xi,&QWP(0,$Xip));
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl));
|
||||
&pshufb ($Xi,$T3);
|
||||
|
||||
&sub ($len,0x10);
|
||||
&jz (&label("odd_tail"));
|
||||
|
||||
#######
|
||||
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
||||
# [(H*Ii+1) + (H*Xi+1)] mod P =
|
||||
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
||||
#
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pshufb ($T1,$T3);
|
||||
&pshufb ($Xn,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
|
||||
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
|
||||
&sub ($len,0x20);
|
||||
&lea ($inp,&DWP(32,$inp)); # i+=2
|
||||
&jbe (&label("even_tail"));
|
||||
|
||||
&set_label("mod_loop");
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
|
||||
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&pxor ($Xhi,$Xhn);
|
||||
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
#######
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pshufb ($T1,$T3);
|
||||
&pshufb ($Xn,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
|
||||
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
|
||||
&sub ($len,0x20);
|
||||
&lea ($inp,&DWP(32,$inp));
|
||||
&ja (&label("mod_loop"));
|
||||
|
||||
&set_label("even_tail");
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
|
||||
|
||||
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&pxor ($Xhi,$Xhn);
|
||||
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&test ($len,$len);
|
||||
&jnz (&label("done"));
|
||||
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
&set_label("odd_tail");
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&pshufb ($T1,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&set_label("done");
|
||||
&pshufb ($Xi,$T3);
|
||||
&movdqu (&QWP(0,$Xip),$Xi);
|
||||
&function_end("GFp_gcm_ghash_clmul");
|
||||
|
||||
}
|
||||
|
||||
&set_label("bswap",64);
|
||||
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
|
||||
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
|
||||
&set_label("rem_8bit",64);
|
||||
&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
|
||||
&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
|
||||
&data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
|
||||
&data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
|
||||
&data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
|
||||
&data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
|
||||
&data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
|
||||
&data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
|
||||
&data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
|
||||
&data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
|
||||
&data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
|
||||
&data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
|
||||
&data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
|
||||
&data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
|
||||
&data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
|
||||
&data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
|
||||
&data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
|
||||
&data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
|
||||
&data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
|
||||
&data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
|
||||
&data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
|
||||
&data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
|
||||
&data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
|
||||
&data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
|
||||
&data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
|
||||
&data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
|
||||
&data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
|
||||
&data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
|
||||
&data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
|
||||
&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
|
||||
&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
|
||||
&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
|
||||
}} # $sse2
|
||||
|
||||
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
&asm_finish();
|
||||
|
||||
close STDOUT or die "error closing STDOUT";
|
||||
|
||||
# A question was raised about choice of vanilla MMX. Or rather why wasn't
|
||||
# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
|
||||
# CPUs such as PIII, "4-bit" MMX version was observed to provide better
|
||||
# performance than *corresponding* SSE2 one even on contemporary CPUs.
|
||||
# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
|
||||
# implementation featuring full range of lookup-table sizes, but with
|
||||
# per-invocation lookup table setup. Latter means that table size is
|
||||
# chosen depending on how much data is to be hashed in every given call,
|
||||
# more data - larger table. Best reported result for Core2 is ~4 cycles
|
||||
# per processed byte out of 64KB block. This number accounts even for
|
||||
# 64KB table setup overhead. As discussed in gcm128.c we choose to be
|
||||
# more conservative in respect to lookup table sizes, but how do the
|
||||
# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
|
||||
# on same platform. As also discussed in gcm128.c, next in line "8-bit
|
||||
# Shoup's" or "4KB" method should deliver twice the performance of
|
||||
# "256B" one, in other words not worse than ~6 cycles per byte. It
|
||||
# should also be noted that in SSE2 case improvement can be "super-
|
||||
# linear," i.e. more than twice, mostly because >>8 maps to single
|
||||
# instruction on SSE2 register. This is unlike "4-bit" case when >>4
|
||||
# maps to same amount of instructions in both MMX and SSE2 cases.
|
||||
# Bottom line is that switch to SSE2 is considered to be justifiable
|
||||
# only in case we choose to implement "8-bit" method...
|
||||
1328
zeroidc/vendor/ring/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
vendored
Normal file
File diff suppressed because it is too large
432
zeroidc/vendor/ring/crypto/fipsmodule/modes/asm/ghashv8-armx.pl
vendored
Normal file
@@ -0,0 +1,432 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
|
||||
#
|
||||
# June 2014
|
||||
# Initial version was developed in tight cooperation with Ard Biesheuvel
|
||||
# of Linaro from bits-n-pieces from other assembly modules. Just like
|
||||
# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
|
||||
#
|
||||
# July 2014
|
||||
# Implement 2x aggregated reduction [see ghash-x86.pl for background
|
||||
# information].
|
||||
#
|
||||
# Current performance in cycles per processed byte:
|
||||
#
|
||||
# PMULL[2] 32-bit NEON(*)
|
||||
# Apple A7 0.92 5.62
|
||||
# Cortex-A53 1.01 8.39
|
||||
# Cortex-A57 1.17 7.61
|
||||
# Denver 0.71 6.02
|
||||
# Mongoose 1.10 8.06
|
||||
# Kryo 1.16 8.00
|
||||
#
|
||||
# (*) presented for reference/comparison purposes;
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
$Xi="x0"; # argument block
|
||||
$Htbl="x1";
|
||||
$inp="x2";
|
||||
$len="x3";
|
||||
|
||||
$inc="x12";
|
||||
|
||||
{
|
||||
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
|
||||
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
|
||||
|
||||
$code=<<___;
|
||||
#include <GFp/arm_arch.h>
|
||||
|
||||
.text
|
||||
___
|
||||
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
.fpu neon
|
||||
.code 32
|
||||
#undef __thumb2__
|
||||
___
|
||||
|
||||
################################################################################
|
||||
# void GFp_gcm_init_clmul(u128 Htable[16],const u64 H[2]);
|
||||
#
|
||||
# input: 128-bit H - secret parameter E(K,0^128)
|
||||
# output: precomputed table filled with degrees of twisted H;
|
||||
# H is twisted to handle reverse bitness of GHASH;
|
||||
# only few of 16 slots of Htable[16] are used;
|
||||
# data is opaque to outside world (which allows to
|
||||
# optimize the code independently);
|
||||
#
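# ("Twisted" refers to the transformation performed below: H is shifted left
#  by one bit and, when a bit carries out, the 0xc2...01 reduction constant is
#  xor-ed in. This is the usual <<1 twist that adapts H to GHASH's
#  bit-reflected representation so later reductions can reuse the same
#  constant.)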
|
||||
$code.=<<___;
|
||||
.global GFp_gcm_init_clmul
|
||||
.type GFp_gcm_init_clmul,%function
|
||||
.align 4
|
||||
GFp_gcm_init_clmul:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
vld1.64 {$t1},[x1] @ load input H
|
||||
vmov.i8 $xC2,#0xe1
|
||||
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
|
||||
vext.8 $IN,$t1,$t1,#8
|
||||
vshr.u64 $t2,$xC2,#63
|
||||
vdup.32 $t1,${t1}[1]
|
||||
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
|
||||
vshr.u64 $t2,$IN,#63
|
||||
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
|
||||
vand $t2,$t2,$t0
|
||||
vshl.i64 $IN,$IN,#1
|
||||
vext.8 $t2,$t2,$t2,#8
|
||||
vand $t0,$t0,$t1
|
||||
vorr $IN,$IN,$t2 @ H<<<=1
|
||||
veor $H,$IN,$t0 @ twisted H
|
||||
vst1.64 {$H},[x0],#16 @ store Htable[0]
|
||||
|
||||
@ calculate H^2
|
||||
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
|
||||
vpmull.p64 $Xl,$H,$H
|
||||
veor $t0,$t0,$H
|
||||
vpmull2.p64 $Xh,$H,$H
|
||||
vpmull.p64 $Xm,$t0,$t0
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
|
||||
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $H2,$Xl,$t2
|
||||
|
||||
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
|
||||
veor $t1,$t1,$H2
|
||||
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
|
||||
vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2]
|
||||
|
||||
ret
|
||||
.size GFp_gcm_init_clmul,.-GFp_gcm_init_clmul
|
||||
___
|
||||
################################################################################
|
||||
# void GFp_gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
|
||||
#
|
||||
# input: Xi - current hash value;
|
||||
# Htable - table precomputed in GFp_gcm_init_clmul;
|
||||
# output: Xi - next hash value Xi;
|
||||
#
|
||||
$code.=<<___;
|
||||
.global GFp_gcm_gmult_clmul
|
||||
.type GFp_gcm_gmult_clmul,%function
|
||||
.align 4
|
||||
GFp_gcm_gmult_clmul:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
vld1.64 {$t1},[$Xi] @ load Xi
|
||||
vmov.i8 $xC2,#0xe1
|
||||
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
|
||||
vshl.u64 $xC2,$xC2,#57
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
vext.8 $IN,$t1,$t1,#8
|
||||
|
||||
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
|
||||
veor $t1,$t1,$IN @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
|
||||
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $Xl,$Xl,$t2
|
||||
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
vst1.64 {$Xl},[$Xi] @ write out Xi
|
||||
|
||||
ret
|
||||
.size GFp_gcm_gmult_clmul,.-GFp_gcm_gmult_clmul
|
||||
___
|
||||
################################################################################
|
||||
# void GFp_gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
|
||||
# size_t len);
|
||||
#
|
||||
# input: table precomputed in GFp_gcm_init_clmul;
|
||||
# current hash value Xi;
|
||||
# pointer to input data;
|
||||
# length of input data in bytes; must be divisible by the block size;
|
||||
# output: next hash value Xi;
|
||||
#
|
||||
$code.=<<___;
|
||||
.global GFp_gcm_ghash_clmul
|
||||
.type GFp_gcm_ghash_clmul,%function
|
||||
.align 4
|
||||
GFp_gcm_ghash_clmul:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
|
||||
___
|
||||
$code.=<<___;
|
||||
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
|
||||
@ "[rotated]" means that
|
||||
@ loaded value would have
|
||||
@ to be rotated in order to
|
||||
@ make it appear as in
|
||||
@ algorithm specification
|
||||
subs $len,$len,#32 @ see if $len is 32 or larger
|
||||
mov $inc,#16 @ $inc is used as post-
|
||||
@ increment for input pointer;
|
||||
@ as loop is modulo-scheduled
|
||||
@ $inc is zeroed just in time
|
||||
@ to preclude overstepping
|
||||
@ inp[len], which means that
|
||||
@ last block[s] are actually
|
||||
@ loaded twice, but last
|
||||
@ copy is not processed
|
||||
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
|
||||
vmov.i8 $xC2,#0xe1
|
||||
vld1.64 {$H2},[$Htbl]
|
||||
cclr $inc,eq @ is it time to zero $inc?
|
||||
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
|
||||
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
|
||||
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t0,$t0
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
|
||||
b.lo .Lodd_tail_v8 @ $len was less than 32
|
||||
___
|
||||
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
|
||||
#######
|
||||
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
||||
# [(H*Ii+1) + (H*Xi+1)] mod P =
|
||||
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
||||
#
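# Editorial note (not part of the original): this is the usual aggregation
# trick. Because Xi+2 can be written in terms of H and H^2 as above, the loop
# below folds two input blocks per iteration (Ii+1 against H, Ii+Xi against
# H^2) and performs a single reduction for the pair, which is also why
# GFp_gcm_init_clmul stores H^2 in the table.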
|
||||
$code.=<<___;
|
||||
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
vext.8 $In,$t1,$t1,#8
|
||||
veor $IN,$IN,$Xl @ I[i]^=Xi
|
||||
vpmull.p64 $Xln,$H,$In @ H·Ii+1
|
||||
veor $t1,$t1,$In @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xhn,$H,$In
|
||||
b .Loop_mod2x_v8
|
||||
|
||||
.align 4
|
||||
.Loop_mod2x_v8:
|
||||
vext.8 $t2,$IN,$IN,#8
|
||||
subs $len,$len,#32 @ is there more data?
|
||||
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
|
||||
cclr $inc,lo @ is it time to zero $inc?
|
||||
|
||||
vpmull.p64 $Xmn,$Hhl,$t1
|
||||
veor $t2,$t2,$IN @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
|
||||
veor $Xl,$Xl,$Xln @ accumulate
|
||||
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
|
||||
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
|
||||
|
||||
veor $Xh,$Xh,$Xhn
|
||||
cclr $inc,eq @ is it time to zero $inc?
|
||||
veor $Xm,$Xm,$Xmn
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t0,$t0
|
||||
#endif
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
vext.8 $In,$t1,$t1,#8
|
||||
vext.8 $IN,$t0,$t0,#8
|
||||
veor $Xl,$Xm,$t2
|
||||
vpmull.p64 $Xln,$H,$In @ H·Ii+1
|
||||
veor $IN,$IN,$Xh @ accumulate $IN early
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $IN,$IN,$t2
|
||||
veor $t1,$t1,$In @ Karatsuba pre-processing
|
||||
veor $IN,$IN,$Xl
|
||||
vpmull2.p64 $Xhn,$H,$In
|
||||
b.hs .Loop_mod2x_v8 @ there were at least 32 more bytes
|
||||
|
||||
veor $Xh,$Xh,$t2
|
||||
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
|
||||
adds $len,$len,#32 @ re-construct $len
|
||||
veor $Xl,$Xl,$Xh @ re-construct $Xl
|
||||
b.eq .Ldone_v8 @ is $len zero?
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.Lodd_tail_v8:
|
||||
vext.8 $t2,$Xl,$Xl,#8
|
||||
veor $IN,$IN,$Xl @ inp^=Xi
|
||||
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
|
||||
|
||||
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
|
||||
veor $t1,$t1,$IN @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
|
||||
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $Xl,$Xl,$t2
|
||||
|
||||
.Ldone_v8:
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
vst1.64 {$Xl},[$Xi] @ write out Xi
|
||||
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
vldmia sp!,{d8-d15} @ 32-bit ABI says so
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.size GFp_gcm_ghash_clmul,.-GFp_gcm_ghash_clmul
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
if ($flavour =~ /64/) { ######## 64-bit code
|
||||
sub unvmov {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
|
||||
sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
|
||||
}
|
||||
foreach(split("\n",$code)) {
|
||||
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
|
||||
s/vmov\.i8/movi/o or # fix up legacy mnemonics
|
||||
s/vmov\s+(.*)/unvmov($1)/geo or
|
||||
s/vext\.8/ext/o or
|
||||
s/vshr\.s/sshr\.s/o or
|
||||
s/vshr/ushr/o or
|
||||
s/^(\s+)v/$1/o or # strip off v prefix
|
||||
s/\bbx\s+lr\b/ret/o;
|
||||
|
||||
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
|
||||
s/@\s/\/\//o; # old->new style commentary
|
||||
|
||||
# fix up remaining legacy suffixes
|
||||
s/\.[ui]?8(\s)/$1/o;
|
||||
s/\.[uis]?32//o and s/\.16b/\.4s/go;
|
||||
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
|
||||
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
|
||||
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
|
||||
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
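# Editorial note (not part of the original): the substitutions above
# mechanically rewrite the 32-bit-style source into AArch64 syntax, e.g.
#
#   vext.8  q0,q1,q2,#8   ->   ext  v0.16b,v1.16b,v2.16b,#8
#
# with q0..q7 mapping to v0..v7 and q8..q15 to v16..v23 per the old->new
# register rule.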
|
||||
} else { ######## 32-bit code
|
||||
sub unvdup32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
}
|
||||
sub unvpmullp64 {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
|
||||
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<17)|(($2&8)<<4)
|
||||
|(($3&7)<<1) |(($3&8)<<2);
|
||||
$word |= 0x00010001 if ($mnemonic =~ "2");
|
||||
# since ARMv7 instructions are always encoded little-endian.
|
||||
# correct solution is to use .inst directive, but older
|
||||
# assemblers don't implement it:-(
|
||||
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||||
$word&0xff,($word>>8)&0xff,
|
||||
($word>>16)&0xff,($word>>24)&0xff,
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
|
||||
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
|
||||
s/\/\/\s?/@ /o; # new->old style commentary
|
||||
|
||||
# fix up remaining new-style suffixes
|
||||
s/\],#[0-9]+/]!/o;
|
||||
|
||||
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
|
||||
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
|
||||
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/^(\s+)b\./$1b/o or
|
||||
s/^(\s+)ret/$1bx\tlr/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
}
|
||||
|
||||
close STDOUT or die "error closing STDOUT"; # enforce flush
|
||||
736
zeroidc/vendor/ring/crypto/fipsmodule/sha/asm/sha256-armv4.pl
vendored
Normal file
@@ -0,0 +1,736 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
#
|
||||
# Permission to use under GPL terms is granted.
|
||||
# ====================================================================
|
||||
|
||||
# SHA256 block procedure for ARMv4. May 2007.
|
||||
|
||||
# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte [on
# single-issue Xscale PXA250 core].
|
||||
|
||||
# July 2010.
|
||||
#
|
||||
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
|
||||
# Cortex A8 core and ~20 cycles per processed byte.
|
||||
|
||||
# February 2011.
|
||||
#
|
||||
# Profiler-assisted and platform-specific optimization resulted in 16%
|
||||
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
|
||||
|
||||
# September 2013.
|
||||
#
|
||||
# Add NEON implementation. On Cortex A8 it was measured to process one
|
||||
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
|
||||
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
|
||||
# code (meaning that the latter performs sub-optimally; nothing was done
|
||||
# about it).
|
||||
|
||||
# May 2014.
|
||||
#
|
||||
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
|
||||
|
||||
$flavour = shift;
|
||||
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
} else {
|
||||
open OUT,">$output";
|
||||
*STDOUT=*OUT;
|
||||
}
|
||||
|
||||
$ctx="r0"; $t0="r0";
|
||||
$inp="r1"; $t4="r1";
|
||||
$len="r2"; $t1="r2";
|
||||
$T1="r3"; $t3="r3";
|
||||
$A="r4";
|
||||
$B="r5";
|
||||
$C="r6";
|
||||
$D="r7";
|
||||
$E="r8";
|
||||
$F="r9";
|
||||
$G="r10";
|
||||
$H="r11";
|
||||
@V=($A,$B,$C,$D,$E,$F,$G,$H);
|
||||
$t2="r12";
|
||||
$Ktbl="r14";
|
||||
|
||||
@Sigma0=( 2,13,22);
|
||||
@Sigma1=( 6,11,25);
|
||||
@sigma0=( 7,18, 3);
|
||||
@sigma1=(17,19,10);
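# Editorial addition (not part of the original script): plain-Perl references
# for the four rotation sets above (assumes a 64-bit perl). They are never
# called; they only document what BODY_00_15/BODY_16_XX compute.
sub _ror32  { my ($x,$n)=@_; return (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
sub _Sigma0 { my $x=shift; return _ror32($x,2)^_ror32($x,13)^_ror32($x,22); }	# on a
sub _Sigma1 { my $x=shift; return _ror32($x,6)^_ror32($x,11)^_ror32($x,25); }	# on e
sub _sigma0 { my $x=shift; return (_ror32($x,7)^_ror32($x,18)^($x>>3)) & 0xffffffff; }
sub _sigma1 { my $x=shift; return (_ror32($x,17)^_ror32($x,19)^($x>>10)) & 0xffffffff; }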
|
||||
|
||||
sub BODY_00_15 {
|
||||
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
|
||||
|
||||
$code.=<<___ if ($i<16);
|
||||
#if __ARM_ARCH__>=7
|
||||
@ ldr $t1,[$inp],#4 @ $i
|
||||
# if $i==15
|
||||
str $inp,[sp,#17*4] @ make room for $t4
|
||||
# endif
|
||||
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
|
||||
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
|
||||
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
|
||||
# ifndef __ARMEB__
|
||||
rev $t1,$t1
|
||||
# endif
|
||||
#else
|
||||
@ ldrb $t1,[$inp,#3] @ $i
|
||||
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
|
||||
ldrb $t2,[$inp,#2]
|
||||
ldrb $t0,[$inp,#1]
|
||||
orr $t1,$t1,$t2,lsl#8
|
||||
ldrb $t2,[$inp],#4
|
||||
orr $t1,$t1,$t0,lsl#16
|
||||
# if $i==15
|
||||
str $inp,[sp,#17*4] @ make room for $t4
|
||||
# endif
|
||||
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
|
||||
orr $t1,$t1,$t2,lsl#24
|
||||
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
|
||||
#endif
|
||||
___
|
||||
$code.=<<___;
|
||||
ldr $t2,[$Ktbl],#4 @ *K256++
|
||||
add $h,$h,$t1 @ h+=X[i]
|
||||
str $t1,[sp,#`$i%16`*4]
|
||||
eor $t1,$f,$g
|
||||
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
|
||||
and $t1,$t1,$e
|
||||
add $h,$h,$t2 @ h+=K256[i]
|
||||
eor $t1,$t1,$g @ Ch(e,f,g)
|
||||
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
|
||||
add $h,$h,$t1 @ h+=Ch(e,f,g)
|
||||
#if $i==31
|
||||
and $t2,$t2,#0xff
|
||||
cmp $t2,#0xf2 @ done?
|
||||
#endif
|
||||
#if $i<15
|
||||
# if __ARM_ARCH__>=7
|
||||
ldr $t1,[$inp],#4 @ prefetch
|
||||
# else
|
||||
ldrb $t1,[$inp,#3]
|
||||
# endif
|
||||
eor $t2,$a,$b @ a^b, b^c in next round
|
||||
#else
|
||||
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
|
||||
eor $t2,$a,$b @ a^b, b^c in next round
|
||||
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
|
||||
#endif
|
||||
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
|
||||
and $t3,$t3,$t2 @ (b^c)&=(a^b)
|
||||
add $d,$d,$h @ d+=h
|
||||
eor $t3,$t3,$b @ Maj(a,b,c)
|
||||
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
|
||||
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
|
||||
___
|
||||
($t2,$t3)=($t3,$t2);
|
||||
}
|
||||
|
||||
sub BODY_16_XX {
|
||||
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
|
||||
|
||||
$code.=<<___;
|
||||
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
|
||||
@ ldr $t4,[sp,#`($i+14)%16`*4]
|
||||
mov $t0,$t1,ror#$sigma0[0]
|
||||
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
|
||||
mov $t2,$t4,ror#$sigma1[0]
|
||||
eor $t0,$t0,$t1,ror#$sigma0[1]
|
||||
eor $t2,$t2,$t4,ror#$sigma1[1]
|
||||
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
|
||||
ldr $t1,[sp,#`($i+0)%16`*4]
|
||||
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
|
||||
ldr $t4,[sp,#`($i+9)%16`*4]
|
||||
|
||||
add $t2,$t2,$t0
|
||||
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
|
||||
add $t1,$t1,$t2
|
||||
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
|
||||
add $t1,$t1,$t4 @ X[i]
|
||||
___
|
||||
&BODY_00_15(@_);
|
||||
}
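# Editorial note (not part of the original): BODY_16_XX implements the SHA-256
# message schedule W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# on the 16-entry circular buffer kept on the stack, which is why the loads
# above address the (i+1), (i+9) and (i+14) slots modulo 16.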
|
||||
|
||||
$code=<<___;
|
||||
#ifndef __KERNEL__
|
||||
# include <GFp/arm_arch.h>
|
||||
#else
|
||||
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
|
||||
# define __ARM_MAX_ARCH__ 7
|
||||
#endif
|
||||
|
||||
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
|
||||
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
|
||||
@ instructions are manually-encoded. (See unsha256.)
|
||||
.arch armv7-a
|
||||
|
||||
.text
|
||||
#if defined(__thumb2__)
|
||||
.syntax unified
|
||||
.thumb
|
||||
#else
|
||||
.code 32
|
||||
#endif
|
||||
|
||||
.type K256,%object
|
||||
.align 5
|
||||
K256:
|
||||
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
.size K256,.-K256
|
||||
.word 0 @ terminator
|
||||
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
|
||||
.extern GFp_armcap_P
|
||||
.hidden GFp_armcap_P
|
||||
.LOPENSSL_armcap:
|
||||
.word GFp_armcap_P-.Lsha256_block_data_order
|
||||
#endif
|
||||
.align 5
|
||||
|
||||
.global GFp_sha256_block_data_order
|
||||
.type GFp_sha256_block_data_order,%function
|
||||
GFp_sha256_block_data_order:
|
||||
.Lsha256_block_data_order:
|
||||
#if __ARM_ARCH__<7 && !defined(__thumb2__)
|
||||
sub r3,pc,#8 @ GFp_sha256_block_data_order
|
||||
#else
|
||||
adr r3,.Lsha256_block_data_order
|
||||
#endif
|
||||
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
|
||||
ldr r12,.LOPENSSL_armcap
|
||||
ldr r12,[r3,r12] @ GFp_armcap_P
|
||||
#ifdef __APPLE__
|
||||
ldr r12,[r12]
|
||||
#endif
|
||||
tst r12,#ARMV8_SHA256
|
||||
bne .LARMv8
|
||||
tst r12,#ARMV7_NEON
|
||||
bne .LNEON
|
||||
#endif
|
||||
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
|
||||
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
|
||||
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
|
||||
sub $Ktbl,r3,#256+32 @ K256
|
||||
sub sp,sp,#16*4 @ alloca(X[16])
|
||||
.Loop:
|
||||
# if __ARM_ARCH__>=7
|
||||
ldr $t1,[$inp],#4
|
||||
# else
|
||||
ldrb $t1,[$inp,#3]
|
||||
# endif
|
||||
eor $t3,$B,$C @ magic
|
||||
eor $t2,$t2,$t2
|
||||
___
|
||||
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=".Lrounds_16_xx:\n";
|
||||
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
#if __ARM_ARCH__>=7
|
||||
ite eq @ Thumb2 thing, sanity check in ARM
|
||||
#endif
|
||||
ldreq $t3,[sp,#16*4] @ pull ctx
|
||||
bne .Lrounds_16_xx
|
||||
|
||||
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
|
||||
ldr $t0,[$t3,#0]
|
||||
ldr $t1,[$t3,#4]
|
||||
ldr $t2,[$t3,#8]
|
||||
add $A,$A,$t0
|
||||
ldr $t0,[$t3,#12]
|
||||
add $B,$B,$t1
|
||||
ldr $t1,[$t3,#16]
|
||||
add $C,$C,$t2
|
||||
ldr $t2,[$t3,#20]
|
||||
add $D,$D,$t0
|
||||
ldr $t0,[$t3,#24]
|
||||
add $E,$E,$t1
|
||||
ldr $t1,[$t3,#28]
|
||||
add $F,$F,$t2
|
||||
ldr $inp,[sp,#17*4] @ pull inp
|
||||
ldr $t2,[sp,#18*4] @ pull inp+len
|
||||
add $G,$G,$t0
|
||||
add $H,$H,$t1
|
||||
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
|
||||
cmp $inp,$t2
|
||||
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
|
||||
bne .Loop
|
||||
|
||||
add sp,sp,#`16+3`*4 @ destroy frame
|
||||
#if __ARM_ARCH__>=5
|
||||
ldmia sp!,{r4-r11,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r11,lr}
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size GFp_sha256_block_data_order,.-GFp_sha256_block_data_order
|
||||
___
|
||||
######################################################################
|
||||
# NEON stuff
|
||||
#
|
||||
{{{
|
||||
my @X=map("q$_",(0..3));
|
||||
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
|
||||
my $Xfer=$t4;
|
||||
my $j=0;
|
||||
|
||||
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
|
||||
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
|
||||
|
||||
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
|
||||
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
|
||||
my $arg = pop;
|
||||
$arg = "#$arg" if ($arg*1 eq $arg);
|
||||
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
|
||||
}
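# Editorial note (not part of the original): the AUTOLOAD thunk above lets the
# schedule code call NEON instructions as if they were Perl subs; e.g. with
# $T0 being "q8", &vext_8($T0,@X[0],@X[1],4) appends "\tvext.8\tq8,q0,q1,#4\n"
# to $code.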
|
||||
|
||||
sub Xupdate()
|
||||
{ use integer;
|
||||
my $body = shift;
|
||||
my @insns = (&$body,&$body,&$body,&$body);
|
||||
my ($a,$b,$c,$d,$e,$f,$g,$h);
|
||||
|
||||
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T2,$T0,$sigma0[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T1,$T0,$sigma0[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vsli_32 ($T2,$T0,32-$sigma0[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T3,$T0,$sigma0[1]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&veor ($T1,$T1,$T2);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vsli_32 ($T3,$T0,32-$sigma0[1]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&veor ($T5,$T5,$T4);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&veor ($T5,$T5,$T4);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vadd_i32 ($T0,$T0,@X[0]);
|
||||
while($#insns>=2) { eval(shift(@insns)); }
|
||||
&vst1_32 ("{$T0}","[$Xfer,:128]!");
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
|
||||
push(@X,shift(@X)); # "rotate" X[]
|
||||
}
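# Editorial note (not part of the original): each Xupdate call advances the
# NEON message schedule by one 128-bit register (four W[] words) while the
# eval(shift(@insns)) calls interleave the scalar instruction strings for four
# rounds produced by body_00_15, overlapping ALU and NEON work.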
|
||||
|
||||
sub Xpreload()
|
||||
{ use integer;
|
||||
my $body = shift;
|
||||
my @insns = (&$body,&$body,&$body,&$body);
|
||||
my ($a,$b,$c,$d,$e,$f,$g,$h);
|
||||
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vrev32_8 (@X[0],@X[0]);
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
eval(shift(@insns));
|
||||
&vadd_i32 ($T0,$T0,@X[0]);
|
||||
foreach (@insns) { eval; } # remaining instructions
|
||||
&vst1_32 ("{$T0}","[$Xfer,:128]!");
|
||||
|
||||
push(@X,shift(@X)); # "rotate" X[]
|
||||
}
|
||||
|
||||
sub body_00_15 () {
|
||||
(
|
||||
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
|
||||
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
|
||||
'&eor ($t1,$f,$g)',
|
||||
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
|
||||
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
|
||||
'&and ($t1,$t1,$e)',
|
||||
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
|
||||
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
|
||||
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
|
||||
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
|
||||
'&eor ($t2,$a,$b)', # a^b, b^c in next round
|
||||
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
|
||||
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
|
||||
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
|
||||
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
|
||||
'&ldr ($t1,"[sp,#64]") if ($j==31)',
|
||||
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
|
||||
'&add ($d,$d,$h)', # d+=h
|
||||
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
|
||||
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
|
||||
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
|
||||
)
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
.type sha256_block_data_order_neon,%function
|
||||
.align 5
|
||||
.skip 16
|
||||
sha256_block_data_order_neon:
|
||||
.LNEON:
|
||||
stmdb sp!,{r4-r12,lr}
|
||||
|
||||
sub $H,sp,#16*4+16
|
||||
adr $Ktbl,K256
|
||||
bic $H,$H,#15 @ align for 128-bit stores
|
||||
mov $t2,sp
|
||||
mov sp,$H @ alloca
|
||||
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
|
||||
|
||||
vld1.8 {@X[0]},[$inp]!
|
||||
vld1.8 {@X[1]},[$inp]!
|
||||
vld1.8 {@X[2]},[$inp]!
|
||||
vld1.8 {@X[3]},[$inp]!
|
||||
vld1.32 {$T0},[$Ktbl,:128]!
|
||||
vld1.32 {$T1},[$Ktbl,:128]!
|
||||
vld1.32 {$T2},[$Ktbl,:128]!
|
||||
vld1.32 {$T3},[$Ktbl,:128]!
|
||||
vrev32.8 @X[0],@X[0] @ yes, even on
|
||||
str $ctx,[sp,#64]
|
||||
vrev32.8 @X[1],@X[1] @ big-endian
|
||||
str $inp,[sp,#68]
|
||||
mov $Xfer,sp
|
||||
vrev32.8 @X[2],@X[2]
|
||||
str $len,[sp,#72]
|
||||
vrev32.8 @X[3],@X[3]
|
||||
str $t2,[sp,#76] @ save original sp
|
||||
vadd.i32 $T0,$T0,@X[0]
|
||||
vadd.i32 $T1,$T1,@X[1]
|
||||
vst1.32 {$T0},[$Xfer,:128]!
|
||||
vadd.i32 $T2,$T2,@X[2]
|
||||
vst1.32 {$T1},[$Xfer,:128]!
|
||||
vadd.i32 $T3,$T3,@X[3]
|
||||
vst1.32 {$T2},[$Xfer,:128]!
|
||||
vst1.32 {$T3},[$Xfer,:128]!
|
||||
|
||||
ldmia $ctx,{$A-$H}
|
||||
sub $Xfer,$Xfer,#64
|
||||
ldr $t1,[sp,#0]
|
||||
eor $t2,$t2,$t2
|
||||
eor $t3,$B,$C
|
||||
b .L_00_48
|
||||
|
||||
.align 4
|
||||
.L_00_48:
|
||||
___
|
||||
&Xupdate(\&body_00_15);
|
||||
&Xupdate(\&body_00_15);
|
||||
&Xupdate(\&body_00_15);
|
||||
&Xupdate(\&body_00_15);
|
||||
$code.=<<___;
|
||||
teq $t1,#0 @ check for K256 terminator
|
||||
ldr $t1,[sp,#0]
|
||||
sub $Xfer,$Xfer,#64
|
||||
bne .L_00_48
|
||||
|
||||
ldr $inp,[sp,#68]
|
||||
ldr $t0,[sp,#72]
|
||||
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
|
||||
teq $inp,$t0
|
||||
it eq
|
||||
subeq $inp,$inp,#64 @ avoid SEGV
|
||||
vld1.8 {@X[0]},[$inp]! @ load next input block
|
||||
vld1.8 {@X[1]},[$inp]!
|
||||
vld1.8 {@X[2]},[$inp]!
|
||||
vld1.8 {@X[3]},[$inp]!
|
||||
it ne
|
||||
strne $inp,[sp,#68]
|
||||
mov $Xfer,sp
|
||||
___
|
||||
&Xpreload(\&body_00_15);
|
||||
&Xpreload(\&body_00_15);
|
||||
&Xpreload(\&body_00_15);
|
||||
&Xpreload(\&body_00_15);
|
||||
$code.=<<___;
|
||||
ldr $t0,[$t1,#0]
|
||||
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
|
||||
ldr $t2,[$t1,#4]
|
||||
ldr $t3,[$t1,#8]
|
||||
ldr $t4,[$t1,#12]
|
||||
add $A,$A,$t0 @ accumulate
|
||||
ldr $t0,[$t1,#16]
|
||||
add $B,$B,$t2
|
||||
ldr $t2,[$t1,#20]
|
||||
add $C,$C,$t3
|
||||
ldr $t3,[$t1,#24]
|
||||
add $D,$D,$t4
|
||||
ldr $t4,[$t1,#28]
|
||||
add $E,$E,$t0
|
||||
str $A,[$t1],#4
|
||||
add $F,$F,$t2
|
||||
str $B,[$t1],#4
|
||||
add $G,$G,$t3
|
||||
str $C,[$t1],#4
|
||||
add $H,$H,$t4
|
||||
str $D,[$t1],#4
|
||||
stmia $t1,{$E-$H}
|
||||
|
||||
ittte ne
|
||||
movne $Xfer,sp
|
||||
ldrne $t1,[sp,#0]
|
||||
eorne $t2,$t2,$t2
|
||||
ldreq sp,[sp,#76] @ restore original sp
|
||||
itt ne
|
||||
eorne $t3,$B,$C
|
||||
bne .L_00_48
|
||||
|
||||
ldmia sp!,{r4-r12,pc}
|
||||
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
|
||||
#endif
|
||||
___
|
||||
}}}
|
||||
######################################################################
|
||||
# ARMv8 stuff
|
||||
#
|
||||
{{{
|
||||
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
|
||||
my @MSG=map("q$_",(8..11));
|
||||
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
|
||||
my $Ktbl="r3";
|
||||
|
||||
$code.=<<___;
|
||||
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
|
||||
|
||||
# if defined(__thumb2__)
|
||||
# define INST(a,b,c,d) .byte c,d|0xc,a,b
|
||||
# else
|
||||
# define INST(a,b,c,d) .byte a,b,c,d
|
||||
# endif
|
||||
|
||||
.type sha256_block_data_order_armv8,%function
|
||||
.align 5
|
||||
sha256_block_data_order_armv8:
|
||||
.LARMv8:
|
||||
vld1.32 {$ABCD,$EFGH},[$ctx]
|
||||
sub $Ktbl,$Ktbl,#256+32
|
||||
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
|
||||
b .Loop_v8
|
||||
|
||||
.align 4
|
||||
.Loop_v8:
|
||||
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
|
||||
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
|
||||
vld1.32 {$W0},[$Ktbl]!
|
||||
vrev32.8 @MSG[0],@MSG[0]
|
||||
vrev32.8 @MSG[1],@MSG[1]
|
||||
vrev32.8 @MSG[2],@MSG[2]
|
||||
vrev32.8 @MSG[3],@MSG[3]
|
||||
vmov $ABCD_SAVE,$ABCD @ offload
|
||||
vmov $EFGH_SAVE,$EFGH
|
||||
teq $inp,$len
|
||||
___
|
||||
for($i=0;$i<12;$i++) {
|
||||
$code.=<<___;
|
||||
vld1.32 {$W1},[$Ktbl]!
|
||||
vadd.i32 $W0,$W0,@MSG[0]
|
||||
sha256su0 @MSG[0],@MSG[1]
|
||||
vmov $abcd,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W0
|
||||
sha256h2 $EFGH,$abcd,$W0
|
||||
sha256su1 @MSG[0],@MSG[2],@MSG[3]
|
||||
___
|
||||
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
||||
}
|
||||
$code.=<<___;
|
||||
vld1.32 {$W1},[$Ktbl]!
|
||||
vadd.i32 $W0,$W0,@MSG[0]
|
||||
vmov $abcd,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W0
|
||||
sha256h2 $EFGH,$abcd,$W0
|
||||
|
||||
vld1.32 {$W0},[$Ktbl]!
|
||||
vadd.i32 $W1,$W1,@MSG[1]
|
||||
vmov $abcd,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W1
|
||||
sha256h2 $EFGH,$abcd,$W1
|
||||
|
||||
vld1.32 {$W1},[$Ktbl]
|
||||
vadd.i32 $W0,$W0,@MSG[2]
|
||||
sub $Ktbl,$Ktbl,#256-16 @ rewind
|
||||
vmov $abcd,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W0
|
||||
sha256h2 $EFGH,$abcd,$W0
|
||||
|
||||
vadd.i32 $W1,$W1,@MSG[3]
|
||||
vmov $abcd,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W1
|
||||
sha256h2 $EFGH,$abcd,$W1
|
||||
|
||||
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
|
||||
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
|
||||
it ne
|
||||
bne .Loop_v8
|
||||
|
||||
vst1.32 {$ABCD,$EFGH},[$ctx]
|
||||
|
||||
ret @ bx lr
|
||||
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
|
||||
#endif
|
||||
___
|
||||
}}}
|
||||
$code.=<<___;
|
||||
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
open SELF,$0;
|
||||
while(<SELF>) {
|
||||
next if (/^#!/);
|
||||
last if (!s/^#/@/ and !/^$/);
|
||||
print;
|
||||
}
|
||||
close SELF;
|
||||
|
||||
{ my %opcode = (
|
||||
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
|
||||
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
|
||||
|
||||
sub unsha256 {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
|
||||
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<17)|(($2&8)<<4)
|
||||
|(($3&7)<<1) |(($3&8)<<2);
|
||||
# since ARMv7 instructions are always encoded little-endian.
|
||||
# correct solution is to use .inst directive, but older
|
||||
# assemblers don't implement it:-(
|
||||
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
|
||||
$word&0xff,($word>>8)&0xff,
|
||||
($word>>16)&0xff,($word>>24)&0xff,
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach (split($/,$code)) {
|
||||
|
||||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
|
||||
|
||||
s/\bret\b/bx lr/go or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
close STDOUT or die "error closing STDOUT"; # enforce flush
|
||||
671
zeroidc/vendor/ring/crypto/fipsmodule/sha/asm/sha512-armv4.pl
vendored
Normal file
@@ -0,0 +1,671 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
#
|
||||
# Permission to use under GPL terms is granted.
|
||||
# ====================================================================
|
||||
|
||||
# SHA512 block procedure for ARMv4. September 2007.
|
||||
|
||||
# This code is ~4.5 (four and a half) times faster than code generated
|
||||
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
|
||||
# Xscale PXA250 core].
|
||||
#
|
||||
# July 2010.
|
||||
#
|
||||
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
|
||||
# Cortex A8 core and ~40 cycles per processed byte.
|
||||
|
||||
# February 2011.
|
||||
#
|
||||
# Profiler-assisted and platform-specific optimization resulted in 7%
|
||||
# improvement on Cortex A8 core and ~38 cycles per byte.
|
||||
|
||||
# March 2011.
|
||||
#
|
||||
# Add NEON implementation. On Cortex A8 it was measured to process
|
||||
# one byte in 23.3 cycles or ~60% faster than integer-only code.
|
||||
|
||||
# August 2012.
|
||||
#
|
||||
# Improve NEON performance by 12% on Snapdragon S4. In absolute
|
||||
# terms it's 22.6 cycles per byte, which is a disappointing result.
|
||||
# Technical writers asserted that the 3-way S4 pipeline can sustain
|
||||
# multiple NEON instructions per cycle, but dual NEON issue could
|
||||
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
|
||||
# for further details. On a side note, Cortex-A15 processes one byte in
|
||||
# 16 cycles.
|
||||
|
||||
# Byte order [in]dependence. =========================================
|
||||
#
|
||||
# Originally the caller was expected to maintain a specific *dword* order in
# h[0-7], namely with the most significant dword at the *lower* address, which
# was reflected in the two parameters below as 0 and 4. Now the caller is
# expected to maintain native byte order for whole 64-bit values.
|
||||
$hi="HI";
|
||||
$lo="LO";
|
||||
# ====================================================================
|
||||
|
||||
$flavour = shift;
|
||||
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
} else {
|
||||
open OUT,">$output";
|
||||
*STDOUT=*OUT;
|
||||
}
|
||||
|
||||
$ctx="r0"; # parameter block
|
||||
$inp="r1";
|
||||
$len="r2";
|
||||
|
||||
$Tlo="r3";
|
||||
$Thi="r4";
|
||||
$Alo="r5";
|
||||
$Ahi="r6";
|
||||
$Elo="r7";
|
||||
$Ehi="r8";
|
||||
$t0="r9";
|
||||
$t1="r10";
|
||||
$t2="r11";
|
||||
$t3="r12";
|
||||
############ r13 is stack pointer
|
||||
$Ktbl="r14";
|
||||
############ r15 is program counter
|
||||
|
||||
$Aoff=8*0;
|
||||
$Boff=8*1;
|
||||
$Coff=8*2;
|
||||
$Doff=8*3;
|
||||
$Eoff=8*4;
|
||||
$Foff=8*5;
|
||||
$Goff=8*6;
|
||||
$Hoff=8*7;
|
||||
$Xoff=8*8;
|
||||
|
||||
sub BODY_00_15() {
|
||||
my $magic = shift;
|
||||
$code.=<<___;
|
||||
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
|
||||
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
|
||||
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
|
||||
mov $t0,$Elo,lsr#14
|
||||
str $Tlo,[sp,#$Xoff+0]
|
||||
mov $t1,$Ehi,lsr#14
|
||||
str $Thi,[sp,#$Xoff+4]
|
||||
eor $t0,$t0,$Ehi,lsl#18
|
||||
ldr $t2,[sp,#$Hoff+0] @ h.lo
|
||||
eor $t1,$t1,$Elo,lsl#18
|
||||
ldr $t3,[sp,#$Hoff+4] @ h.hi
|
||||
eor $t0,$t0,$Elo,lsr#18
|
||||
eor $t1,$t1,$Ehi,lsr#18
|
||||
eor $t0,$t0,$Ehi,lsl#14
|
||||
eor $t1,$t1,$Elo,lsl#14
|
||||
eor $t0,$t0,$Ehi,lsr#9
|
||||
eor $t1,$t1,$Elo,lsr#9
|
||||
eor $t0,$t0,$Elo,lsl#23
|
||||
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
|
||||
adds $Tlo,$Tlo,$t0
|
||||
ldr $t0,[sp,#$Foff+0] @ f.lo
|
||||
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
|
||||
ldr $t1,[sp,#$Foff+4] @ f.hi
|
||||
adds $Tlo,$Tlo,$t2
|
||||
ldr $t2,[sp,#$Goff+0] @ g.lo
|
||||
adc $Thi,$Thi,$t3 @ T += h
|
||||
ldr $t3,[sp,#$Goff+4] @ g.hi
|
||||
|
||||
eor $t0,$t0,$t2
|
||||
str $Elo,[sp,#$Eoff+0]
|
||||
eor $t1,$t1,$t3
|
||||
str $Ehi,[sp,#$Eoff+4]
|
||||
and $t0,$t0,$Elo
|
||||
str $Alo,[sp,#$Aoff+0]
|
||||
and $t1,$t1,$Ehi
|
||||
str $Ahi,[sp,#$Aoff+4]
|
||||
eor $t0,$t0,$t2
|
||||
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
|
||||
eor $t1,$t1,$t3 @ Ch(e,f,g)
|
||||
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
|
||||
|
||||
adds $Tlo,$Tlo,$t0
|
||||
ldr $Elo,[sp,#$Doff+0] @ d.lo
|
||||
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
|
||||
ldr $Ehi,[sp,#$Doff+4] @ d.hi
|
||||
adds $Tlo,$Tlo,$t2
|
||||
and $t0,$t2,#0xff
|
||||
adc $Thi,$Thi,$t3 @ T += K[i]
|
||||
adds $Elo,$Elo,$Tlo
|
||||
ldr $t2,[sp,#$Boff+0] @ b.lo
|
||||
adc $Ehi,$Ehi,$Thi @ d += T
|
||||
teq $t0,#$magic
|
||||
|
||||
ldr $t3,[sp,#$Coff+0] @ c.lo
|
||||
#if __ARM_ARCH__>=7
|
||||
it eq @ Thumb2 thing, sanity check in ARM
|
||||
#endif
|
||||
orreq $Ktbl,$Ktbl,#1
|
||||
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
|
||||
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
|
||||
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
|
||||
mov $t0,$Alo,lsr#28
|
||||
mov $t1,$Ahi,lsr#28
|
||||
eor $t0,$t0,$Ahi,lsl#4
|
||||
eor $t1,$t1,$Alo,lsl#4
|
||||
eor $t0,$t0,$Ahi,lsr#2
|
||||
eor $t1,$t1,$Alo,lsr#2
|
||||
eor $t0,$t0,$Alo,lsl#30
|
||||
eor $t1,$t1,$Ahi,lsl#30
|
||||
eor $t0,$t0,$Ahi,lsr#7
|
||||
eor $t1,$t1,$Alo,lsr#7
|
||||
eor $t0,$t0,$Alo,lsl#25
|
||||
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
|
||||
adds $Tlo,$Tlo,$t0
|
||||
and $t0,$Alo,$t2
|
||||
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
|
||||
|
||||
ldr $t1,[sp,#$Boff+4] @ b.hi
|
||||
orr $Alo,$Alo,$t2
|
||||
ldr $t2,[sp,#$Coff+4] @ c.hi
|
||||
and $Alo,$Alo,$t3
|
||||
and $t3,$Ahi,$t1
|
||||
orr $Ahi,$Ahi,$t1
|
||||
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
|
||||
and $Ahi,$Ahi,$t2
|
||||
adds $Alo,$Alo,$Tlo
|
||||
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
|
||||
sub sp,sp,#8
|
||||
adc $Ahi,$Ahi,$Thi @ h += T
|
||||
tst $Ktbl,#1
|
||||
add $Ktbl,$Ktbl,#8
|
||||
___
|
||||
}
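# Editorial addition (not part of the original script): a plain-Perl model of
# the technique used throughout BODY_00_15 above, where a 64-bit rotate right
# is expressed as paired shifts on (hi,lo) 32-bit halves (assumes a 64-bit
# perl). Never called; for cross-checking the lo/hi identities only.
sub _ror64_pair {
	my ($hi,$lo,$n) = @_;			# rotate the 64-bit value hi:lo right by n
	($hi,$lo,$n) = ($lo,$hi,$n-32) if ($n >= 32);
	return ($hi,$lo) if ($n == 0);
	my $new_lo = (($lo >> $n) | ($hi << (32-$n))) & 0xffffffff;
	my $new_hi = (($hi >> $n) | ($lo << (32-$n))) & 0xffffffff;
	return ($new_hi,$new_lo);
}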
|
||||
$code=<<___;
|
||||
#ifndef __KERNEL__
|
||||
# include <GFp/arm_arch.h>
|
||||
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
|
||||
# define VFP_ABI_POP vldmia sp!,{d8-d15}
|
||||
#else
|
||||
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
|
||||
# define __ARM_MAX_ARCH__ 7
|
||||
# define VFP_ABI_PUSH
|
||||
# define VFP_ABI_POP
|
||||
#endif
|
||||
|
||||
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
|
||||
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
|
||||
.arch armv7-a
|
||||
|
||||
#ifdef __ARMEL__
|
||||
# define LO 0
|
||||
# define HI 4
|
||||
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
|
||||
#else
|
||||
# define HI 0
|
||||
# define LO 4
|
||||
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
|
||||
#endif
|
||||
|
||||
.text
|
||||
#if defined(__thumb2__)
|
||||
.syntax unified
|
||||
.thumb
|
||||
# define adrl adr
|
||||
#else
|
||||
.code 32
|
||||
#endif
|
||||
|
||||
.type K512,%object
|
||||
.align 5
|
||||
K512:
|
||||
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
|
||||
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
|
||||
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
|
||||
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
|
||||
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
|
||||
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
|
||||
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
|
||||
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
|
||||
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
|
||||
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
|
||||
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
|
||||
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
|
||||
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
|
||||
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
|
||||
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
|
||||
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
|
||||
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
|
||||
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
|
||||
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
|
||||
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
|
||||
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
|
||||
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
|
||||
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
|
||||
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
|
||||
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
|
||||
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
|
||||
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
|
||||
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
|
||||
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
|
||||
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
|
||||
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
|
||||
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
|
||||
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
|
||||
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
|
||||
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
|
||||
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
|
||||
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
|
||||
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
|
||||
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
|
||||
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
|
||||
.size K512,.-K512
|
||||
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
|
||||
.extern GFp_armcap_P
|
||||
.hidden GFp_armcap_P
|
||||
.LOPENSSL_armcap:
|
||||
.word GFp_armcap_P-.Lsha512_block_data_order
|
||||
.skip 32-4
|
||||
#else
|
||||
.skip 32
|
||||
#endif
|
||||
|
||||
.global GFp_sha512_block_data_order
|
||||
.type GFp_sha512_block_data_order,%function
|
||||
GFp_sha512_block_data_order:
|
||||
.Lsha512_block_data_order:
|
||||
#if __ARM_ARCH__<7 && !defined(__thumb2__)
|
||||
sub r3,pc,#8 @ GFp_sha512_block_data_order
|
||||
#else
|
||||
adr r3,.Lsha512_block_data_order
|
||||
#endif
|
||||
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
|
||||
ldr r12,.LOPENSSL_armcap
|
||||
ldr r12,[r3,r12] @ GFp_armcap_P
|
||||
#ifdef __APPLE__
|
||||
ldr r12,[r12]
|
||||
#endif
|
||||
tst r12,#ARMV7_NEON
|
||||
bne .LNEON
|
||||
#endif
|
||||
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
|
||||
stmdb sp!,{r4-r12,lr}
|
||||
sub $Ktbl,r3,#672 @ K512
|
||||
sub sp,sp,#9*8
|
||||
|
||||
ldr $Elo,[$ctx,#$Eoff+$lo]
|
||||
ldr $Ehi,[$ctx,#$Eoff+$hi]
|
||||
ldr $t0, [$ctx,#$Goff+$lo]
|
||||
ldr $t1, [$ctx,#$Goff+$hi]
|
||||
ldr $t2, [$ctx,#$Hoff+$lo]
|
||||
ldr $t3, [$ctx,#$Hoff+$hi]
|
||||
.Loop:
|
||||
str $t0, [sp,#$Goff+0]
|
||||
str $t1, [sp,#$Goff+4]
|
||||
str $t2, [sp,#$Hoff+0]
|
||||
str $t3, [sp,#$Hoff+4]
|
||||
ldr $Alo,[$ctx,#$Aoff+$lo]
|
||||
ldr $Ahi,[$ctx,#$Aoff+$hi]
|
||||
ldr $Tlo,[$ctx,#$Boff+$lo]
|
||||
ldr $Thi,[$ctx,#$Boff+$hi]
|
||||
ldr $t0, [$ctx,#$Coff+$lo]
|
||||
ldr $t1, [$ctx,#$Coff+$hi]
|
||||
ldr $t2, [$ctx,#$Doff+$lo]
|
||||
ldr $t3, [$ctx,#$Doff+$hi]
|
||||
str $Tlo,[sp,#$Boff+0]
|
||||
str $Thi,[sp,#$Boff+4]
|
||||
str $t0, [sp,#$Coff+0]
|
||||
str $t1, [sp,#$Coff+4]
|
||||
str $t2, [sp,#$Doff+0]
|
||||
str $t3, [sp,#$Doff+4]
|
||||
ldr $Tlo,[$ctx,#$Foff+$lo]
|
||||
ldr $Thi,[$ctx,#$Foff+$hi]
|
||||
str $Tlo,[sp,#$Foff+0]
|
||||
str $Thi,[sp,#$Foff+4]
|
||||
|
||||
.L00_15:
|
||||
#if __ARM_ARCH__<7
|
||||
ldrb $Tlo,[$inp,#7]
|
||||
ldrb $t0, [$inp,#6]
|
||||
ldrb $t1, [$inp,#5]
|
||||
ldrb $t2, [$inp,#4]
|
||||
ldrb $Thi,[$inp,#3]
|
||||
ldrb $t3, [$inp,#2]
|
||||
orr $Tlo,$Tlo,$t0,lsl#8
|
||||
ldrb $t0, [$inp,#1]
|
||||
orr $Tlo,$Tlo,$t1,lsl#16
|
||||
ldrb $t1, [$inp],#8
|
||||
orr $Tlo,$Tlo,$t2,lsl#24
|
||||
orr $Thi,$Thi,$t3,lsl#8
|
||||
orr $Thi,$Thi,$t0,lsl#16
|
||||
orr $Thi,$Thi,$t1,lsl#24
|
||||
#else
|
||||
ldr $Tlo,[$inp,#4]
|
||||
ldr $Thi,[$inp],#8
|
||||
#ifdef __ARMEL__
|
||||
rev $Tlo,$Tlo
|
||||
rev $Thi,$Thi
|
||||
#endif
|
||||
#endif
|
||||
___
|
||||
&BODY_00_15(0x94);
|
||||
$code.=<<___;
|
||||
tst $Ktbl,#1
|
||||
beq .L00_15
|
||||
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
|
||||
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
|
||||
bic $Ktbl,$Ktbl,#1
|
||||
.L16_79:
|
||||
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
|
||||
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
|
||||
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
|
||||
mov $Tlo,$t0,lsr#1
|
||||
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
|
||||
mov $Thi,$t1,lsr#1
|
||||
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
|
||||
eor $Tlo,$Tlo,$t1,lsl#31
|
||||
eor $Thi,$Thi,$t0,lsl#31
|
||||
eor $Tlo,$Tlo,$t0,lsr#8
|
||||
eor $Thi,$Thi,$t1,lsr#8
|
||||
eor $Tlo,$Tlo,$t1,lsl#24
|
||||
eor $Thi,$Thi,$t0,lsl#24
|
||||
eor $Tlo,$Tlo,$t0,lsr#7
|
||||
eor $Thi,$Thi,$t1,lsr#7
|
||||
eor $Tlo,$Tlo,$t1,lsl#25
|
||||
|
||||
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
|
||||
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
|
||||
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
|
||||
mov $t0,$t2,lsr#19
|
||||
mov $t1,$t3,lsr#19
|
||||
eor $t0,$t0,$t3,lsl#13
|
||||
eor $t1,$t1,$t2,lsl#13
|
||||
eor $t0,$t0,$t3,lsr#29
|
||||
eor $t1,$t1,$t2,lsr#29
|
||||
eor $t0,$t0,$t2,lsl#3
|
||||
eor $t1,$t1,$t3,lsl#3
|
||||
eor $t0,$t0,$t2,lsr#6
|
||||
eor $t1,$t1,$t3,lsr#6
|
||||
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
|
||||
eor $t0,$t0,$t3,lsl#26
|
||||
|
||||
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
|
||||
adds $Tlo,$Tlo,$t0
|
||||
ldr $t0,[sp,#`$Xoff+8*16`+0]
|
||||
adc $Thi,$Thi,$t1
|
||||
|
||||
ldr $t1,[sp,#`$Xoff+8*16`+4]
|
||||
adds $Tlo,$Tlo,$t2
|
||||
adc $Thi,$Thi,$t3
|
||||
adds $Tlo,$Tlo,$t0
|
||||
adc $Thi,$Thi,$t1
|
||||
___
|
||||
&BODY_00_15(0x17);
|
||||
$code.=<<___;
|
||||
#if __ARM_ARCH__>=7
|
||||
ittt eq @ Thumb2 thing, sanity check in ARM
|
||||
#endif
|
||||
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
|
||||
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
|
||||
beq .L16_79
|
||||
bic $Ktbl,$Ktbl,#1
|
||||
|
||||
ldr $Tlo,[sp,#$Boff+0]
|
||||
ldr $Thi,[sp,#$Boff+4]
|
||||
ldr $t0, [$ctx,#$Aoff+$lo]
|
||||
ldr $t1, [$ctx,#$Aoff+$hi]
|
||||
ldr $t2, [$ctx,#$Boff+$lo]
|
||||
ldr $t3, [$ctx,#$Boff+$hi]
|
||||
adds $t0,$Alo,$t0
|
||||
str $t0, [$ctx,#$Aoff+$lo]
|
||||
adc $t1,$Ahi,$t1
|
||||
str $t1, [$ctx,#$Aoff+$hi]
|
||||
adds $t2,$Tlo,$t2
|
||||
str $t2, [$ctx,#$Boff+$lo]
|
||||
adc $t3,$Thi,$t3
|
||||
str $t3, [$ctx,#$Boff+$hi]
|
||||
|
||||
ldr $Alo,[sp,#$Coff+0]
|
||||
ldr $Ahi,[sp,#$Coff+4]
|
||||
ldr $Tlo,[sp,#$Doff+0]
|
||||
ldr $Thi,[sp,#$Doff+4]
|
||||
ldr $t0, [$ctx,#$Coff+$lo]
|
||||
ldr $t1, [$ctx,#$Coff+$hi]
|
||||
ldr $t2, [$ctx,#$Doff+$lo]
|
||||
ldr $t3, [$ctx,#$Doff+$hi]
|
||||
adds $t0,$Alo,$t0
|
||||
str $t0, [$ctx,#$Coff+$lo]
|
||||
adc $t1,$Ahi,$t1
|
||||
str $t1, [$ctx,#$Coff+$hi]
|
||||
adds $t2,$Tlo,$t2
|
||||
str $t2, [$ctx,#$Doff+$lo]
|
||||
adc $t3,$Thi,$t3
|
||||
str $t3, [$ctx,#$Doff+$hi]
|
||||
|
||||
ldr $Tlo,[sp,#$Foff+0]
|
||||
ldr $Thi,[sp,#$Foff+4]
|
||||
ldr $t0, [$ctx,#$Eoff+$lo]
|
||||
ldr $t1, [$ctx,#$Eoff+$hi]
|
||||
ldr $t2, [$ctx,#$Foff+$lo]
|
||||
ldr $t3, [$ctx,#$Foff+$hi]
|
||||
adds $Elo,$Elo,$t0
|
||||
str $Elo,[$ctx,#$Eoff+$lo]
|
||||
adc $Ehi,$Ehi,$t1
|
||||
str $Ehi,[$ctx,#$Eoff+$hi]
|
||||
adds $t2,$Tlo,$t2
|
||||
str $t2, [$ctx,#$Foff+$lo]
|
||||
adc $t3,$Thi,$t3
|
||||
str $t3, [$ctx,#$Foff+$hi]
|
||||
|
||||
ldr $Alo,[sp,#$Goff+0]
|
||||
ldr $Ahi,[sp,#$Goff+4]
|
||||
ldr $Tlo,[sp,#$Hoff+0]
|
||||
ldr $Thi,[sp,#$Hoff+4]
|
||||
ldr $t0, [$ctx,#$Goff+$lo]
|
||||
ldr $t1, [$ctx,#$Goff+$hi]
|
||||
ldr $t2, [$ctx,#$Hoff+$lo]
|
||||
ldr $t3, [$ctx,#$Hoff+$hi]
|
||||
adds $t0,$Alo,$t0
|
||||
str $t0, [$ctx,#$Goff+$lo]
|
||||
adc $t1,$Ahi,$t1
|
||||
str $t1, [$ctx,#$Goff+$hi]
|
||||
adds $t2,$Tlo,$t2
|
||||
str $t2, [$ctx,#$Hoff+$lo]
|
||||
adc $t3,$Thi,$t3
|
||||
str $t3, [$ctx,#$Hoff+$hi]
|
||||
|
||||
add sp,sp,#640
|
||||
sub $Ktbl,$Ktbl,#640
|
||||
|
||||
teq $inp,$len
|
||||
bne .Loop
|
||||
|
||||
add sp,sp,#8*9 @ destroy frame
|
||||
#if __ARM_ARCH__>=5
|
||||
ldmia sp!,{r4-r12,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r12,lr}
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size GFp_sha512_block_data_order,.-GFp_sha512_block_data_order
|
||||
___
|
||||
|
||||
{
|
||||
my @Sigma0=(28,34,39);
|
||||
my @Sigma1=(14,18,41);
|
||||
my @sigma0=(1, 8, 7);
|
||||
my @sigma1=(19,61,6);
|
||||
|
||||
my $Ktbl="r3";
|
||||
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
|
||||
|
||||
my @X=map("d$_",(0..15));
|
||||
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
|
||||
|
||||
sub NEON_00_15() {
|
||||
my $i=shift;
|
||||
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
|
||||
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
|
||||
|
||||
$code.=<<___ if ($i<16 || $i&1);
|
||||
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
|
||||
#if $i<16
|
||||
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
|
||||
#endif
|
||||
vshr.u64 $t1,$e,#@Sigma1[1]
|
||||
#if $i>0
|
||||
vadd.i64 $a,$Maj @ h+=Maj from the past
|
||||
#endif
|
||||
vshr.u64 $t2,$e,#@Sigma1[2]
|
||||
___
|
||||
$code.=<<___;
|
||||
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
|
||||
vsli.64 $t0,$e,#`64-@Sigma1[0]`
|
||||
vsli.64 $t1,$e,#`64-@Sigma1[1]`
|
||||
vmov $Ch,$e
|
||||
vsli.64 $t2,$e,#`64-@Sigma1[2]`
|
||||
#if $i<16 && defined(__ARMEL__)
|
||||
vrev64.8 @X[$i],@X[$i]
|
||||
#endif
|
||||
veor $t1,$t0
|
||||
vbsl $Ch,$f,$g @ Ch(e,f,g)
|
||||
vshr.u64 $t0,$a,#@Sigma0[0]
|
||||
veor $t2,$t1 @ Sigma1(e)
|
||||
vadd.i64 $T1,$Ch,$h
|
||||
vshr.u64 $t1,$a,#@Sigma0[1]
|
||||
vsli.64 $t0,$a,#`64-@Sigma0[0]`
|
||||
vadd.i64 $T1,$t2
|
||||
vshr.u64 $t2,$a,#@Sigma0[2]
|
||||
vadd.i64 $K,@X[$i%16]
|
||||
vsli.64 $t1,$a,#`64-@Sigma0[1]`
|
||||
veor $Maj,$a,$b
|
||||
vsli.64 $t2,$a,#`64-@Sigma0[2]`
|
||||
veor $h,$t0,$t1
|
||||
vadd.i64 $T1,$K
|
||||
vbsl $Maj,$c,$b @ Maj(a,b,c)
|
||||
veor $h,$t2 @ Sigma0(a)
|
||||
vadd.i64 $d,$T1
|
||||
vadd.i64 $Maj,$T1
|
||||
@ vadd.i64 $h,$Maj
|
||||
___
|
||||
}
|
||||
|
||||
sub NEON_16_79() {
|
||||
my $i=shift;
|
||||
|
||||
if ($i&1) { &NEON_00_15($i,@_); return; }
|
||||
|
||||
# 2x-vectorized, therefore runs every 2nd round
|
||||
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
|
||||
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
|
||||
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
|
||||
my $e=@_[4]; # $e from NEON_00_15
|
||||
$i /= 2;
|
||||
$code.=<<___;
|
||||
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
|
||||
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
|
||||
vadd.i64 @_[0],d30 @ h+=Maj from the past
|
||||
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
|
||||
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
|
||||
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
|
||||
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
|
||||
veor $s1,$t0
|
||||
vshr.u64 $t0,$s0,#@sigma0[0]
|
||||
veor $s1,$t1 @ sigma1(X[i+14])
|
||||
vshr.u64 $t1,$s0,#@sigma0[1]
|
||||
vadd.i64 @X[$i%8],$s1
|
||||
vshr.u64 $s1,$s0,#@sigma0[2]
|
||||
vsli.64 $t0,$s0,#`64-@sigma0[0]`
|
||||
vsli.64 $t1,$s0,#`64-@sigma0[1]`
|
||||
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
|
||||
veor $s1,$t0
|
||||
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
|
||||
vadd.i64 @X[$i%8],$s0
|
||||
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
|
||||
veor $s1,$t1 @ sigma0(X[i+1])
|
||||
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
|
||||
vadd.i64 @X[$i%8],$s1
|
||||
___
|
||||
&NEON_00_15(2*$i,@_);
|
||||
}
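# Editorial note (not part of the original): NEON_16_79 is 2x-vectorized; odd
# rounds simply fall through to NEON_00_15, while even rounds update the
# schedule for two 64-bit words at once by viewing @X as 128-bit q registers,
# hence the $i/2 re-indexing above.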
|
||||
|
||||
$code.=<<___;
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
.type sha512_block_data_order_neon,%function
|
||||
.align 4
|
||||
sha512_block_data_order_neon:
|
||||
.LNEON:
|
||||
dmb @ errata #451034 on early Cortex A8
|
||||
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
|
||||
adr $Ktbl,K512
|
||||
VFP_ABI_PUSH
|
||||
vldmia $ctx,{$A-$H} @ load context
|
||||
.Loop_neon:
|
||||
___
|
||||
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
mov $cnt,#4
|
||||
.L16_79_neon:
|
||||
subs $cnt,#1
|
||||
___
|
||||
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
bne .L16_79_neon
|
||||
|
||||
vadd.i64 $A,d30 @ h+=Maj from the past
|
||||
vldmia $ctx,{d24-d31} @ load context to temp
|
||||
vadd.i64 q8,q12 @ vectorized accumulate
|
||||
vadd.i64 q9,q13
|
||||
vadd.i64 q10,q14
|
||||
vadd.i64 q11,q15
|
||||
vstmia $ctx,{$A-$H} @ save context
|
||||
teq $inp,$len
|
||||
sub $Ktbl,#640 @ rewind K512
|
||||
bne .Loop_neon
|
||||
|
||||
VFP_ABI_POP
|
||||
ret @ bx lr
|
||||
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
|
||||
#endif
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||||
$code =~ s/\bret\b/bx lr/gm;
|
||||
|
||||
open SELF,$0;
|
||||
while(<SELF>) {
|
||||
next if (/^#!/);
|
||||
last if (!s/^#/@/ and !/^$/);
|
||||
print;
|
||||
}
|
||||
close SELF;
|
||||
|
||||
print $code;
|
||||
close STDOUT or die "error closing STDOUT"; # enforce flush
|
||||
462
zeroidc/vendor/ring/crypto/fipsmodule/sha/asm/sha512-armv8.pl
vendored
Normal file
@@ -0,0 +1,462 @@
|
||||
#! /usr/bin/env perl
|
||||
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
#
|
||||
# Permission to use under GPLv2 terms is granted.
|
||||
# ====================================================================
|
||||
#
|
||||
# SHA256/512 for ARMv8.
|
||||
#
|
||||
# Performance in cycles per processed byte and improvement coefficient
|
||||
# over code generated with "default" compiler:
|
||||
#
|
||||
# SHA256-hw SHA256(*) SHA512
|
||||
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
|
||||
# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
|
||||
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
|
||||
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
|
||||
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
|
||||
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
|
||||
#
|
||||
# (*) Software SHA256 results are of lesser relevance, presented
|
||||
# mostly for informational purposes.
|
||||
# (**) The result is a trade-off: it's possible to improve it by
|
||||
# 10% (or by 1 cycle per round), but at the cost of 20% loss
|
||||
# on Cortex-A53 (or by 4 cycles per round).
|
||||
# (***) Super-impressive coefficients over gcc-generated code are
|
||||
# indication of some compiler "pathology", most notably code
|
||||
# generated with -mgeneral-regs-only is significanty faster
|
||||
# and the gap is only 40-90%.

$output=pop;
$flavour=pop;

if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}

if ($output =~ /sha512-armv8/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
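# For reference, these are the FIPS 180-4 rotation/shift amounts: all three
# entries of the "big" @Sigma arrays are rotate counts, while the last entry
# of the "small" @sigma arrays is a logical shift (hence the lsr operands in
# BODY_00_xx below), e.g. for SHA-512:
#   Sigma0(a) = ror(a,28) ^ ror(a,34) ^ ror(a,39)
#   sigma1(x) = ror(x,19) ^ ror(x,61) ^ (x >> 6)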

$func="GFp_sha${BITS}_block_data_order";

($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));

@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));

sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);

$code.=<<___ if ($i<16);
#ifndef __ARMEB__
rev @X[$i],@X[$i] // $i
#endif
___
$code.=<<___ if ($i<13 && ($i&1));
ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___ if ($i==13);
ldp @X[14],@X[15],[$inp]
___
$code.=<<___ if ($i>=14);
ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___ if ($i>0 && $i<16);
add $a,$a,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=11);
str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies a merged rotate-and-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such a merged instruction is
# not necessarily the best choice on the critical path... On the other
# hand, Cortex-A5x handles merged instructions much better than a
# disjoint rotate and logical... See the (**) footnote above.
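# Purely as an illustration of that trade-off (operand choice is arbitrary):
# the merged form is a single instruction,
#   eor x16,x16,x10,ror#18
# while the split form costs an extra instruction but starts the rotate early,
#   ror x17,x10,#18
#   eor x16,x16,x17
# The rounds below mix both, e.g. the leading "ror $t0,$e,#..." followed by
# merged eor-with-ror terms.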
$code.=<<___ if ($i<15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
and $t1,$f,$e
bic $t2,$g,$e
add $h,$h,@X[$i&15] // h+=X[i]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
ror $T0,$a,#$Sigma0[0]
add $h,$h,$t1 // h+=Ch(e,f,g)
eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
add $h,$h,$t0 // h+=Sigma1(e)
and $t3,$t3,$t2 // (b^c)&=(a^b)
add $d,$d,$h // d+=h
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
//add $h,$h,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
ror $T1,@X[($j+1)&15],#$sigma0[0]
and $t1,$f,$e
ror $T2,@X[($j+14)&15],#$sigma1[0]
bic $t2,$g,$e
ror $T0,$a,#$Sigma0[0]
add $h,$h,@X[$i&15] // h+=X[i]
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
eor $T0,$T0,$a,ror#$Sigma0[1]
add $h,$h,$t1 // h+=Ch(e,f,g)
and $t3,$t3,$t2 // (b^c)&=(a^b)
eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
add $h,$h,$t0 // h+=Sigma1(e)
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
add @X[$j],@X[$j],@X[($j+9)&15]
add $d,$d,$h // d+=h
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
add @X[$j],@X[$j],$T1
add $h,$h,$t1 // h+=Sigma0(a)
add @X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);
}
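# In short, each call to BODY_00_xx emits one standard SHA-2 round, matching
# the inline // comments above:
#   h += K[i] + X[i] + Ch(e,f,g) + Sigma1(e)
#   d += h
#   h += Maj(a,b,c) + Sigma0(a)
# and, from round 15 onwards, one step of message-schedule expansion:
#   X[j] += X[j+9] + sigma0(X[j+1]) + sigma1(X[j+14])   (indices mod 16)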

$code.=<<___;
#ifndef __KERNEL__
# include <GFp/arm_arch.h>
#endif

.text

.extern GFp_armcap_P
.hidden GFp_armcap_P
.globl $func
.type $func,%function
.align 6
$func:
___
$code.=<<___ if ($SZ==4);
AARCH64_VALID_CALL_TARGET
#ifndef __KERNEL__
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:GFp_armcap_P
#else
adrp x16,:pg_hi21:GFp_armcap_P
#endif
ldr w16,[x16,:lo12:GFp_armcap_P]
tst w16,#ARMV8_SHA256
b.ne .Lv8_entry
#endif
___
$code.=<<___;
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0

stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ

ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adrp $Ktbl,:pg_hi21:.LK$BITS
add $Ktbl,$Ktbl,:lo12:.LK$BITS
stp $ctx,$num,[x29,#96]

.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx

ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind

ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop

ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size $func,.-$func

.section .rodata
.align 6
.type .LK$BITS,%object
.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___

if ($SZ==4) {
my $Ktbl="x3";

my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");

$code.=<<___;
.text
#ifndef __KERNEL__
.type sha256_block_armv8,%function
.align 6
sha256_block_armv8:
.Lv8_entry:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
stp x29,x30,[sp,#-16]!
add x29,sp,#0

ld1.32 {$ABCD,$EFGH},[$ctx]
adrp $Ktbl,:pg_hi21:.LK256
add $Ktbl,$Ktbl,:lo12:.LK256

.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0

ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1

ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0

add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1

add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE

cbnz $num,.Loop_hw

st1.32 {$ABCD,$EFGH},[$ctx]

ldr x29,[sp],#16
ret
.size sha256_block_armv8,.-sha256_block_armv8
#endif
___
}

{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );

sub unsha256 {
my ($mnemonic,$arg)=@_;

$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
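# Example of what unsha256() produces from the table above (the operands are
# only illustrative): "sha256h v0,v1,v16" is rewritten as
#   .inst 0x5e104020 //sha256h v0,v1,v16
# i.e. 0x5e004000 | Rd(0) | Rn(1)<<5 | Rm(16)<<16, so the output still builds
# with assemblers that do not recognise the SHA-256 mnemonics.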

open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;

foreach(split("\n",$code)) {

s/\`([^\`]*)\`/eval($1)/geo;

s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;

s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;

print $_,"\n";
}
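# A sketch of that post-processing on one emitted line (values as interpolated
# above): "add.i32 $ABCD,$ABCD,$ABCD_SAVE", i.e.
#   add.i32 v0.16b,v0.16b,v18.16b
# is printed as
#   add v0.4s,v0.4s,v18.4s
# because the ".i32" size suffix is dropped and ".16b" operands are rewritten
# as ".4s"; sha256* mnemonics have already been converted to .inst words.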

close STDOUT or die "error closing STDOUT";
1675
zeroidc/vendor/ring/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
vendored
Normal file
File diff suppressed because it is too large