Fix the RPM build (reverting CI changes that will need to be un-reverted or made conditional) and vendor Rust dependencies to make builds much faster in any CI system.

This commit is contained in:
Adam Ierymenko
2022-06-08 07:32:16 -04:00
parent 373ca30269
commit d5ca4e5f52
12611 changed files with 2898014 additions and 284 deletions


@@ -0,0 +1,961 @@
/* Copyright (c) 2019, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include <GFp/aes.h>
#include "../../internal.h"
#if defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif
// This file contains a constant-time implementation of AES, bitsliced with
// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
//
// This implementation is based on the algorithms described in the following
// references:
// - https://bearssl.org/constanttime.html#aes
// - https://eprint.iacr.org/2009/129.pdf
// - https://eprint.iacr.org/2009/191.pdf
// Word operations.
//
// An aes_word_t is the word used for this AES implementation. Throughout this
// file, bits and bytes are ordered little-endian, though "left" and "right"
// shifts match the operations themselves, which makes them reversed in a
// little-endian, left-to-right reading.
//
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
// bits each, each corresponding to a byte in an AES block in column-major
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
// contents of a logical byte will be described later.)
//
// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
// value ranges from 0 to 15 independent of |aes_word_t| and
// |AES_NOHW_BATCH_SIZE|.
//
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
// uses row-major order. Matching the AES order was easier to reason about, and
// we do not have PSHUFB available to arbitrarily permute bytes.
#if defined(OPENSSL_SSE2)
typedef __m128i aes_word_t;
// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
// MSVC, so we define a constant.
#define AES_NOHW_WORD_SIZE 16
#define AES_NOHW_BATCH_SIZE 8
#define AES_NOHW_ROW0_MASK \
_mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
#define AES_NOHW_ROW1_MASK \
_mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
#define AES_NOHW_ROW2_MASK \
_mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
#define AES_NOHW_ROW3_MASK \
_mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
#define AES_NOHW_COL01_MASK \
_mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
#define AES_NOHW_COL2_MASK \
_mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
#define AES_NOHW_COL3_MASK \
_mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)
static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
return _mm_and_si128(a, b);
}
static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
return _mm_or_si128(a, b);
}
static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
return _mm_xor_si128(a, b);
}
static inline aes_word_t aes_nohw_not(aes_word_t a) {
return _mm_xor_si128(
a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
}
// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
// must be constants.
#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
_mm_slli_si128((a), (i))
#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
_mm_srli_si128((a), (i))
#else // !OPENSSL_SSE2
#if defined(OPENSSL_64_BIT)
typedef uint64_t aes_word_t;
#define AES_NOHW_WORD_SIZE 8
#define AES_NOHW_BATCH_SIZE 4
#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
#else // !OPENSSL_64_BIT
typedef uint32_t aes_word_t;
#define AES_NOHW_WORD_SIZE 4
#define AES_NOHW_BATCH_SIZE 2
#define AES_NOHW_ROW0_MASK 0x03030303
#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
#define AES_NOHW_ROW2_MASK 0x30303030
#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
#define AES_NOHW_COL01_MASK 0x0000ffff
#define AES_NOHW_COL2_MASK 0x00ff0000
#define AES_NOHW_COL3_MASK 0xff000000
#endif // OPENSSL_64_BIT
static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
return a & b;
}
static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
return a | b;
}
static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
return a ^ b;
}
static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }
static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
return a << (i * AES_NOHW_BATCH_SIZE);
}
static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
return a >> (i * AES_NOHW_BATCH_SIZE);
}
#endif // OPENSSL_SSE2
OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
"batch size does not match word size");
OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
"AES_NOHW_WORD_SIZE is incorrect");
// Block representations.
//
// This implementation uses three representations for AES blocks. First, the
// public API represents blocks as uint8_t[16] in the usual way. Second, most
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
// bars divide logical bytes):
//
// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
// ...
//
// Finally, an individual block may be stored as an intermediate form in an
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
// block, so that block[0]'s ith logical byte contains least-significant
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
// "compacting" the block. Note this is no-op with 128-bit words because then
// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
// words, one block would be stored in two words:
//
// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
//
// Observe that the distances between corresponding bits in bitsliced and
// compact bit orders match. If we line up corresponding words of each block,
// the bitsliced and compact representations may be converted by transposing bits
// in corresponding logical bytes. Continuing the 64-bit example:
//
// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ...
// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ...
// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ...
//
// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//
// Note also that bitwise operations and (logical) byte permutations on an
// |aes_word_t| work equally for the bitsliced and compact words.
//
// We use the compact form in the |AES_KEY| representation to save work
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
// before or after |aes_nohw_transpose|.
#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))
// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
// specified, it is in bitsliced form.
typedef struct {
aes_word_t w[8];
} AES_NOHW_BATCH;
// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH_SIZE|
// |AES_KEY|s, so it should not be used as a long-term key representation.
typedef struct {
// keys is an array of batches, one for each round key. Each batch stores
// |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
AES_NOHW_BATCH keys[AES_MAXNR + 1];
} AES_NOHW_SCHEDULE;
// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
// compact form.
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
const aes_word_t in[AES_NOHW_BLOCK_WORDS],
size_t i) {
// Note the words are interleaved. The order comes from |aes_nohw_transpose|.
// If |i| is zero and this is the 64-bit implementation, in[0] contains bits
// 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
// w[4] so that bits 0 and 4 are in the correct position. (In general, bits
// along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
// will be correctly placed.)
dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
batch->w[i] = in[0];
#elif defined(OPENSSL_64_BIT)
batch->w[i] = in[0];
batch->w[i + 4] = in[1];
#else
batch->w[i] = in[0];
batch->w[i + 2] = in[1];
batch->w[i + 4] = in[2];
batch->w[i + 6] = in[3];
#endif
}
// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
// compact form.
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
aes_word_t out[AES_NOHW_BLOCK_WORDS],
size_t i) {
dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
out[0] = batch->w[i];
#elif defined(OPENSSL_64_BIT)
out[0] = batch->w[i];
out[1] = batch->w[i + 4];
#else
out[0] = batch->w[i];
out[1] = batch->w[i + 2];
out[2] = batch->w[i + 4];
out[3] = batch->w[i + 6];
#endif
}
#if !defined(OPENSSL_SSE2)
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
aes_word_t shift) {
// See
// https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
aes_word_t b = (a ^ (a >> shift)) & mask;
return a ^ b ^ (b << shift);
}
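// Worked illustration of the delta swap (hypothetical values; the helper below
// is not used by the implementation): with a = 0x06, mask = 0x01 and shift = 2,
// bit 0 (clear) and bit 2 (set) trade places:
//   b = (0x06 ^ (0x06 >> 2)) & 0x01 = 0x01
//   a ^ b ^ (b << 2)               = 0x06 ^ 0x01 ^ 0x04 = 0x03
static inline aes_word_t aes_nohw_delta_swap_example(void) {
  return aes_nohw_delta_swap(0x06, 0x01, 2);  // == 0x03
}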
// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
// Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
// quartets of those chunks:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
// Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
// Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
// 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
return a;
}
static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
// Reverse the steps of |aes_nohw_compact_word|.
a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
return a;
}
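// A small round-trip sketch (illustrative and unused): |aes_nohw_compact_word|
// and |aes_nohw_uncompact_word| are inverses, since each applies the same three
// delta swaps in opposite order, so this always returns 1.
static inline int aes_nohw_compact_word_roundtrip_example(uint64_t a) {
  return aes_nohw_uncompact_word(aes_nohw_compact_word(a)) == a;
}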
#else // !OPENSSL_64_BIT
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
// Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
// Note: 0x00cc = 0b0000_0000_1100_1100
// 0x00cc << 6 = 0b0011_0011_0000_0000
a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
// Now we swap groups of four bits (still numbering by pairs):
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
// 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
// Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
return a;
}
static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
// Reverse the steps of |aes_nohw_compact_word|.
a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
return a;
}
static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
uint8_t a2, uint8_t a3) {
return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
((uint32_t)a3 << 24);
}
static inline uint8_t lo(uint32_t a) {
return (uint8_t)a;
}
#endif // OPENSSL_64_BIT
#endif // !OPENSSL_SSE2
static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
const uint8_t in[16]) {
GFp_memcpy(out, in, 16);
#if defined(OPENSSL_SSE2)
// No conversions needed.
#elif defined(OPENSSL_64_BIT)
uint64_t a0 = aes_nohw_compact_word(out[0]);
uint64_t a1 = aes_nohw_compact_word(out[1]);
out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
#else
uint32_t a0 = aes_nohw_compact_word(out[0]);
uint32_t a1 = aes_nohw_compact_word(out[1]);
uint32_t a2 = aes_nohw_compact_word(out[2]);
uint32_t a3 = aes_nohw_compact_word(out[3]);
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
#endif
}
static inline void aes_nohw_uncompact_block(
uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
#if defined(OPENSSL_SSE2)
GFp_memcpy(out, in, 16); // No conversions needed.
#elif defined(OPENSSL_64_BIT)
uint64_t a0 = in[0];
uint64_t a1 = in[1];
uint64_t b0 =
aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
uint64_t b1 =
aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
GFp_memcpy(out, &b0, 8);
GFp_memcpy(out + 8, &b1, 8);
#else
uint32_t a0 = in[0];
uint32_t a1 = in[1];
uint32_t a2 = in[2];
uint32_t a3 = in[3];
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
uint32_t b2 =
aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
uint32_t b3 =
aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
b0 = aes_nohw_uncompact_word(b0);
b1 = aes_nohw_uncompact_word(b1);
b2 = aes_nohw_uncompact_word(b2);
b3 = aes_nohw_uncompact_word(b3);
GFp_memcpy(out, &b0, 4);
GFp_memcpy(out + 4, &b1, 4);
GFp_memcpy(out + 8, &b2, 4);
GFp_memcpy(out + 12, &b3, 4);
#endif
}
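// Illustrative sketch (the helper below is hypothetical and never called):
// compacting a block and then uncompacting it reproduces the original 16 bytes,
// which is the invariant that |aes_nohw_compact_block| and
// |aes_nohw_uncompact_block| rely on when moving data through the compact form.
static inline int aes_nohw_compact_block_roundtrip_example(const uint8_t in[16]) {
  aes_word_t words[AES_NOHW_BLOCK_WORDS];
  uint8_t out[16];
  aes_nohw_compact_block(words, in);
  aes_nohw_uncompact_block(out, words);
  for (size_t i = 0; i < 16; i++) {
    if (in[i] != out[i]) {
      return 0;
    }
  }
  return 1;
}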
// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
// is repeated to the full width of |aes_word_t|.
#if defined(OPENSSL_SSE2)
// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
// constant shift values.
#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b, \
/* uint32_t */ mask, /* const */ shift) \
do { \
__m128i swap = \
_mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \
_mm_set_epi32((mask), (mask), (mask), (mask))); \
*(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift))); \
*(b) = _mm_xor_si128(*(b), swap); \
\
} while (0)
#else
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
uint32_t mask, aes_word_t shift) {
#if defined(OPENSSL_64_BIT)
aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
#else
aes_word_t mask_w = mask;
#endif
// This is a variation on a delta swap.
aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
*a ^= swap << shift;
*b ^= swap;
}
#endif // OPENSSL_SSE2
// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
// and transposes each square.
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
// Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);
#if AES_NOHW_BATCH_SIZE >= 4
// Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
#endif
#if AES_NOHW_BATCH_SIZE >= 8
// Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
#endif
}
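// Sketch of the invariant used below (hypothetical helper, unused):
// |aes_nohw_transpose| transposes each AES_NOHW_BATCH_SIZE x
// AES_NOHW_BATCH_SIZE bit square in place, so it is its own inverse and the
// same function converts both to and from bitsliced form.
static inline void aes_nohw_transpose_roundtrip_example(AES_NOHW_BATCH *batch) {
  aes_nohw_transpose(batch);  // compact -> bitsliced
  aes_nohw_transpose(batch);  // bitsliced -> compact; |batch| is restored
}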
// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
size_t num_blocks) {
// Don't leave unused blocks uninitialized.
GFp_memset(out, 0, sizeof(AES_NOHW_BATCH));
debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
for (size_t i = 0; i < num_blocks; i++) {
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block, in + 16 * i);
aes_nohw_batch_set(out, block, i);
}
aes_nohw_transpose(out);
}
// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
const AES_NOHW_BATCH *batch) {
AES_NOHW_BATCH copy = *batch;
aes_nohw_transpose(&copy);
debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
for (size_t i = 0; i < num_blocks; i++) {
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_batch_get(&copy, block, i);
aes_nohw_uncompact_block(out + 16 * i, block);
}
}
// AES round steps.
static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
const AES_NOHW_BATCH *key) {
for (size_t i = 0; i < 8; i++) {
batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
}
}
static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
// See https://eprint.iacr.org/2009/191.pdf, Appendix C.
aes_word_t x0 = batch->w[7];
aes_word_t x1 = batch->w[6];
aes_word_t x2 = batch->w[5];
aes_word_t x3 = batch->w[4];
aes_word_t x4 = batch->w[3];
aes_word_t x5 = batch->w[2];
aes_word_t x6 = batch->w[1];
aes_word_t x7 = batch->w[0];
// Figure 2, the top linear transformation.
aes_word_t y14 = aes_nohw_xor(x3, x5);
aes_word_t y13 = aes_nohw_xor(x0, x6);
aes_word_t y9 = aes_nohw_xor(x0, x3);
aes_word_t y8 = aes_nohw_xor(x0, x5);
aes_word_t t0 = aes_nohw_xor(x1, x2);
aes_word_t y1 = aes_nohw_xor(t0, x7);
aes_word_t y4 = aes_nohw_xor(y1, x3);
aes_word_t y12 = aes_nohw_xor(y13, y14);
aes_word_t y2 = aes_nohw_xor(y1, x0);
aes_word_t y5 = aes_nohw_xor(y1, x6);
aes_word_t y3 = aes_nohw_xor(y5, y8);
aes_word_t t1 = aes_nohw_xor(x4, y12);
aes_word_t y15 = aes_nohw_xor(t1, x5);
aes_word_t y20 = aes_nohw_xor(t1, x1);
aes_word_t y6 = aes_nohw_xor(y15, x7);
aes_word_t y10 = aes_nohw_xor(y15, t0);
aes_word_t y11 = aes_nohw_xor(y20, y9);
aes_word_t y7 = aes_nohw_xor(x7, y11);
aes_word_t y17 = aes_nohw_xor(y10, y11);
aes_word_t y19 = aes_nohw_xor(y10, y8);
aes_word_t y16 = aes_nohw_xor(t0, y11);
aes_word_t y21 = aes_nohw_xor(y13, y16);
aes_word_t y18 = aes_nohw_xor(x0, y16);
// Figure 3, the middle non-linear section.
aes_word_t t2 = aes_nohw_and(y12, y15);
aes_word_t t3 = aes_nohw_and(y3, y6);
aes_word_t t4 = aes_nohw_xor(t3, t2);
aes_word_t t5 = aes_nohw_and(y4, x7);
aes_word_t t6 = aes_nohw_xor(t5, t2);
aes_word_t t7 = aes_nohw_and(y13, y16);
aes_word_t t8 = aes_nohw_and(y5, y1);
aes_word_t t9 = aes_nohw_xor(t8, t7);
aes_word_t t10 = aes_nohw_and(y2, y7);
aes_word_t t11 = aes_nohw_xor(t10, t7);
aes_word_t t12 = aes_nohw_and(y9, y11);
aes_word_t t13 = aes_nohw_and(y14, y17);
aes_word_t t14 = aes_nohw_xor(t13, t12);
aes_word_t t15 = aes_nohw_and(y8, y10);
aes_word_t t16 = aes_nohw_xor(t15, t12);
aes_word_t t17 = aes_nohw_xor(t4, t14);
aes_word_t t18 = aes_nohw_xor(t6, t16);
aes_word_t t19 = aes_nohw_xor(t9, t14);
aes_word_t t20 = aes_nohw_xor(t11, t16);
aes_word_t t21 = aes_nohw_xor(t17, y20);
aes_word_t t22 = aes_nohw_xor(t18, y19);
aes_word_t t23 = aes_nohw_xor(t19, y21);
aes_word_t t24 = aes_nohw_xor(t20, y18);
aes_word_t t25 = aes_nohw_xor(t21, t22);
aes_word_t t26 = aes_nohw_and(t21, t23);
aes_word_t t27 = aes_nohw_xor(t24, t26);
aes_word_t t28 = aes_nohw_and(t25, t27);
aes_word_t t29 = aes_nohw_xor(t28, t22);
aes_word_t t30 = aes_nohw_xor(t23, t24);
aes_word_t t31 = aes_nohw_xor(t22, t26);
aes_word_t t32 = aes_nohw_and(t31, t30);
aes_word_t t33 = aes_nohw_xor(t32, t24);
aes_word_t t34 = aes_nohw_xor(t23, t33);
aes_word_t t35 = aes_nohw_xor(t27, t33);
aes_word_t t36 = aes_nohw_and(t24, t35);
aes_word_t t37 = aes_nohw_xor(t36, t34);
aes_word_t t38 = aes_nohw_xor(t27, t36);
aes_word_t t39 = aes_nohw_and(t29, t38);
aes_word_t t40 = aes_nohw_xor(t25, t39);
aes_word_t t41 = aes_nohw_xor(t40, t37);
aes_word_t t42 = aes_nohw_xor(t29, t33);
aes_word_t t43 = aes_nohw_xor(t29, t40);
aes_word_t t44 = aes_nohw_xor(t33, t37);
aes_word_t t45 = aes_nohw_xor(t42, t41);
aes_word_t z0 = aes_nohw_and(t44, y15);
aes_word_t z1 = aes_nohw_and(t37, y6);
aes_word_t z2 = aes_nohw_and(t33, x7);
aes_word_t z3 = aes_nohw_and(t43, y16);
aes_word_t z4 = aes_nohw_and(t40, y1);
aes_word_t z5 = aes_nohw_and(t29, y7);
aes_word_t z6 = aes_nohw_and(t42, y11);
aes_word_t z7 = aes_nohw_and(t45, y17);
aes_word_t z8 = aes_nohw_and(t41, y10);
aes_word_t z9 = aes_nohw_and(t44, y12);
aes_word_t z10 = aes_nohw_and(t37, y3);
aes_word_t z11 = aes_nohw_and(t33, y4);
aes_word_t z12 = aes_nohw_and(t43, y13);
aes_word_t z13 = aes_nohw_and(t40, y5);
aes_word_t z14 = aes_nohw_and(t29, y2);
aes_word_t z15 = aes_nohw_and(t42, y9);
aes_word_t z16 = aes_nohw_and(t45, y14);
aes_word_t z17 = aes_nohw_and(t41, y8);
// Figure 4, bottom linear transformation.
aes_word_t t46 = aes_nohw_xor(z15, z16);
aes_word_t t47 = aes_nohw_xor(z10, z11);
aes_word_t t48 = aes_nohw_xor(z5, z13);
aes_word_t t49 = aes_nohw_xor(z9, z10);
aes_word_t t50 = aes_nohw_xor(z2, z12);
aes_word_t t51 = aes_nohw_xor(z2, z5);
aes_word_t t52 = aes_nohw_xor(z7, z8);
aes_word_t t53 = aes_nohw_xor(z0, z3);
aes_word_t t54 = aes_nohw_xor(z6, z7);
aes_word_t t55 = aes_nohw_xor(z16, z17);
aes_word_t t56 = aes_nohw_xor(z12, t48);
aes_word_t t57 = aes_nohw_xor(t50, t53);
aes_word_t t58 = aes_nohw_xor(z4, t46);
aes_word_t t59 = aes_nohw_xor(z3, t54);
aes_word_t t60 = aes_nohw_xor(t46, t57);
aes_word_t t61 = aes_nohw_xor(z14, t57);
aes_word_t t62 = aes_nohw_xor(t52, t58);
aes_word_t t63 = aes_nohw_xor(t49, t58);
aes_word_t t64 = aes_nohw_xor(z4, t59);
aes_word_t t65 = aes_nohw_xor(t61, t62);
aes_word_t t66 = aes_nohw_xor(z1, t63);
aes_word_t s0 = aes_nohw_xor(t59, t63);
aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
aes_word_t t67 = aes_nohw_xor(t64, t65);
aes_word_t s3 = aes_nohw_xor(t53, t66);
aes_word_t s4 = aes_nohw_xor(t51, t66);
aes_word_t s5 = aes_nohw_xor(t47, t65);
aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));
batch->w[0] = s7;
batch->w[1] = s6;
batch->w[2] = s5;
batch->w[3] = s4;
batch->w[4] = s3;
batch->w[5] = s2;
batch->w[6] = s1;
batch->w[7] = s0;
}
// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
// constant shift counts in the SSE2 implementation.
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
(aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \
aes_nohw_shift_left((v), 16 - (n)*4)))
static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
for (size_t i = 0; i < 8; i++) {
aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
row1 = aes_nohw_rotate_cols_right(row1, 1);
row2 = aes_nohw_rotate_cols_right(row2, 2);
row3 = aes_nohw_rotate_cols_right(row3, 3);
batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
}
}
// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
// down by one.
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
#if defined(OPENSSL_SSE2)
return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
#elif defined(OPENSSL_64_BIT)
return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
((v << 12) & UINT64_C(0xf000f000f000f000));
#else
return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
#endif
}
// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
// by two.
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
#if defined(OPENSSL_SSE2)
return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
#elif defined(OPENSSL_64_BIT)
return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
((v << 8) & UINT64_C(0xff00ff00ff00ff00));
#else
return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
#endif
}
static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
// See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
aes_word_t a0 = batch->w[0];
aes_word_t a1 = batch->w[1];
aes_word_t a2 = batch->w[2];
aes_word_t a3 = batch->w[3];
aes_word_t a4 = batch->w[4];
aes_word_t a5 = batch->w[5];
aes_word_t a6 = batch->w[6];
aes_word_t a7 = batch->w[7];
aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
aes_word_t a7_r7 = aes_nohw_xor(a7, r7);
batch->w[0] =
aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
batch->w[1] =
aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
batch->w[2] =
aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
batch->w[3] =
aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
batch->w[4] =
aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
batch->w[5] =
aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
batch->w[6] =
aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
batch->w[7] =
aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
}
static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
size_t num_rounds, AES_NOHW_BATCH *batch) {
aes_nohw_add_round_key(batch, &key->keys[0]);
for (size_t i = 1; i < num_rounds; i++) {
aes_nohw_sub_bytes(batch);
aes_nohw_shift_rows(batch);
aes_nohw_mix_columns(batch);
aes_nohw_add_round_key(batch, &key->keys[i]);
}
aes_nohw_sub_bytes(batch);
aes_nohw_shift_rows(batch);
aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
}
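// A minimal sketch of how the batch machinery is driven for a single block
// (essentially what |GFp_aes_nohw_encrypt| below does; the helper itself is
// hypothetical and unused): load into a batch, run the rounds, unload.
static inline void aes_nohw_encrypt_one_example(const AES_NOHW_SCHEDULE *sched,
                                                size_t num_rounds,
                                                const uint8_t in[16],
                                                uint8_t out[16]) {
  AES_NOHW_BATCH batch;
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
  aes_nohw_encrypt_batch(sched, num_rounds, &batch);
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}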
// Key schedule.
static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
const AES_KEY *key) {
for (unsigned i = 0; i <= key->rounds; i++) {
// Copy the round key into each block in the batch.
for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
GFp_memcpy(tmp, key->rd_key + 4 * i, 16);
aes_nohw_batch_set(&out->keys[i], tmp, j);
}
aes_nohw_transpose(&out->keys[i]);
}
}
static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
0x20, 0x40, 0x80, 0x1b, 0x36};
// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
// |rcon|, stored in an |aes_word_t|.
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
#if defined(OPENSSL_SSE2)
return _mm_set_epi32(0, 0, 0, rcon);
#else
return ((aes_word_t)rcon);
#endif
}
static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
AES_NOHW_BATCH batch;
GFp_memset(&batch, 0, sizeof(batch));
aes_nohw_batch_set(&batch, in, 0);
aes_nohw_transpose(&batch);
aes_nohw_sub_bytes(&batch);
aes_nohw_transpose(&batch);
aes_nohw_batch_get(&batch, out, 0);
}
static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
key->rounds = 10;
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block, in);
GFp_memcpy(key->rd_key, block, 16);
for (size_t i = 1; i <= 10; i++) {
aes_word_t sub[AES_NOHW_BLOCK_WORDS];
aes_nohw_sub_block(sub, block);
uint8_t rcon = aes_nohw_rcon[i - 1];
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate |rcon| and the transformed word into the first word.
block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
block[j] = aes_nohw_xor(
block[j],
aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
// Propagate to the remaining words. Note this is reordered from the usual
// formulation to avoid needing masks.
aes_word_t v = block[j];
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
}
GFp_memcpy(key->rd_key + 4 * i, block, 16);
}
}
static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
key->rounds = 14;
// Each key schedule iteration produces two round keys.
aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block1, in);
GFp_memcpy(key->rd_key, block1, 16);
aes_nohw_compact_block(block2, in + 16);
GFp_memcpy(key->rd_key + 4, block2, 16);
for (size_t i = 2; i <= 14; i += 2) {
aes_word_t sub[AES_NOHW_BLOCK_WORDS];
aes_nohw_sub_block(sub, block2);
uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate |rcon| and the transformed word into the first word.
block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
block1[j] = aes_nohw_xor(
block1[j],
aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
// Propagate to the remaining words.
aes_word_t v = block1[j];
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
}
GFp_memcpy(key->rd_key + 4 * i, block1, 16);
if (i == 14) {
break;
}
aes_nohw_sub_block(sub, block1);
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate the transformed word into the first word.
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
// Propagate to the remaining words.
aes_word_t v = block2[j];
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
}
GFp_memcpy(key->rd_key + 4 * (i + 1), block2, 16);
}
}
// External API.
int GFp_aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
AES_KEY *aeskey) {
switch (bits) {
case 128:
aes_nohw_setup_key_128(aeskey, key);
return 0;
case 256:
aes_nohw_setup_key_256(aeskey, key);
return 0;
}
return 1;
}
void GFp_aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
AES_NOHW_SCHEDULE sched;
aes_nohw_expand_round_keys(&sched, key);
AES_NOHW_BATCH batch;
aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}
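// Minimal usage sketch for the external API (the wrapper below is hypothetical
// and unused; |AES_KEY| and the GFp_aes_nohw_* entry points are as declared in
// the included headers): expand a 128-bit key, then encrypt one 16-byte block.
static inline int aes_nohw_encrypt_block_example(const uint8_t key_bytes[16],
                                                 const uint8_t in[16],
                                                 uint8_t out[16]) {
  AES_KEY key;
  if (GFp_aes_nohw_set_encrypt_key(key_bytes, /*bits=*/128, &key) != 0) {
    return 0;  // unsupported key size
  }
  GFp_aes_nohw_encrypt(in, out, &key);
  return 1;
}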
static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
const uint8_t b[16]) {
for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
aes_word_t x, y;
GFp_memcpy(&x, a + i, sizeof(aes_word_t));
GFp_memcpy(&y, b + i, sizeof(aes_word_t));
x = aes_nohw_xor(x, y);
GFp_memcpy(out + i, &x, sizeof(aes_word_t));
}
}
void GFp_aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
size_t blocks, const AES_KEY *key,
const uint8_t ivec[16]) {
if (blocks == 0) {
return;
}
AES_NOHW_SCHEDULE sched;
aes_nohw_expand_round_keys(&sched, key);
// Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
alignas(AES_NOHW_WORD_SIZE) union {
uint32_t u32[AES_NOHW_BATCH_SIZE * 4];
uint8_t u8[AES_NOHW_BATCH_SIZE * 16];
} ivs, enc_ivs;
for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
GFp_memcpy(ivs.u8 + 16 * i, ivec, 16);
}
uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]);
for (;;) {
// Update counters.
for (uint32_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
ivs.u32[4 * i + 3] = CRYPTO_bswap4(ctr + i);
}
size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
AES_NOHW_BATCH batch;
aes_nohw_to_batch(&batch, ivs.u8, todo);
aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
aes_nohw_from_batch(enc_ivs.u8, todo, &batch);
for (size_t i = 0; i < todo; i++) {
aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs.u8 + 16 * i);
}
blocks -= todo;
if (blocks == 0) {
break;
}
in += 16 * AES_NOHW_BATCH_SIZE;
out += 16 * AES_NOHW_BATCH_SIZE;
ctr += AES_NOHW_BATCH_SIZE;
}
}
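// Illustrative CTR sketch (hypothetical wrapper, unused): encrypt |blocks|
// complete 16-byte blocks in counter mode, with the big-endian 32-bit counter
// in the last four bytes of |ivec| as described above. |ivec| itself is not
// updated by |GFp_aes_nohw_ctr32_encrypt_blocks|.
static inline void aes_nohw_ctr32_example(const uint8_t key_bytes[16],
                                          const uint8_t ivec[16],
                                          const uint8_t *in, uint8_t *out,
                                          size_t blocks) {
  AES_KEY key;
  if (GFp_aes_nohw_set_encrypt_key(key_bytes, /*bits=*/128, &key) == 0) {
    GFp_aes_nohw_ctr32_encrypt_blocks(in, out, blocks, &key, ivec);
  }
}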


@@ -0,0 +1,971 @@
#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#          16-byte  64-byte  256-byte  1-KB    8-KB
#          53-67%   67-84%   91-94%    95-98%  97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
# November 2015
#
# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#              CBC en-/decrypt  CTR   XTS   ECB   OCB
# Westmere     3.77/1.37        1.37  1.52  1.27
# * Bridge     5.07/0.98        0.99  1.09  0.91  1.10
# Haswell      4.44/0.80        0.97  1.03  0.72  0.76
# Skylake      2.68/0.65        0.65  0.66  0.64  0.66
# Silvermont   5.77/3.56        3.67  4.03  3.46  4.03
# Goldmont     3.84/1.39        1.39  1.63  1.31  1.70
# Bulldozer    5.80/0.98        1.05  1.24  0.93  1.23
$PREFIX="GFp_aes_hw"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
$AESNI_PREFIX="GFp_aes_hw";
$inline=1; # inline _aesni_[en|de]crypt
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0]);
&external_label("GFp_ia32cap_P");
&static_label("key_const");
if ($PREFIX eq $AESNI_PREFIX) { $movekey=\&movups; }
else { $movekey=\&movups; }
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp"; # backup copy for $key
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5"; $in1="xmm5";
$inout4="xmm6"; $in0="xmm6";
$inout5="xmm7"; $ivec="xmm7";
# AESNI extension
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
{ &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
}
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
{ &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
}
sub aesimc { aescommon(0xdb,@_); }
sub aesenc { aescommon(0xdc,@_); }
sub aesenclast { aescommon(0xdd,@_); }
# Inline version of internal aesni_[en|de]crypt1
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
$sn++;
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($ivec,$rndkey0) if (defined($ivec));
&lea ($key,&DWP(32,$key));
&xorps ($inout,$ivec) if (defined($ivec));
&xorps ($inout,$rndkey0) if (!defined($ivec));
&set_label("${p}1_loop_$sn");
eval"&aes${p} ($inout,$rndkey1)";
&dec ($rounds);
&$movekey ($rndkey1,&QWP(0,$key));
&lea ($key,&DWP(16,$key));
&jnz (&label("${p}1_loop_$sn"));
eval"&aes${p}last ($inout,$rndkey1)";
}}
sub aesni_generate1 # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
&function_begin_B("_aesni_${p}rypt1");
&movups ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(0x10,$key));
&xorps ($inout,$rndkey0);
&$movekey ($rndkey0,&QWP(0x20,$key));
&lea ($key,&DWP(0x30,$key));
&cmp ($rounds,11);
&jb (&label("${p}128"));
&lea ($key,&DWP(0x40,$key));
# 192-bit key support was removed.
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(-0x40,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(-0x30,$key));
# 192-bit key support was removed.
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(-0x20,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(-0x10,$key));
&set_label("${p}128");
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x10,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x20,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x30,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x40,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x50,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x60,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x70,$key));
eval"&aes${p} ($inout,$rndkey1)";
eval"&aes${p}last ($inout,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt1");
}
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
&mov ("eax",&wparam(0));
&mov ($key,&wparam(2));
&movups ($inout0,&QWP(0,"eax"));
&mov ($rounds,&DWP(240,$key));
&mov ("eax",&wparam(1));
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&pxor ($rndkey0,$rndkey0); # clear register bank
&pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
&pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_encrypt");
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...
sub aesni_generate2
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt2");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&add ($rounds,16);
&set_label("${p}2_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}2_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt2");
}
sub aesni_generate3
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt3");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&add ($rounds,16);
&set_label("${p}3_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}3_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt3");
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt4");
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&shl ($rounds,4);
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&data_byte (0x0f,0x1f,0x40,0x00);
&add ($rounds,16);
&set_label("${p}4_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}4_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt4");
}
sub aesni_generate6
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt6");
&static_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0); # pxor does better here
&pxor ($inout2,$rndkey0);
eval"&aes${p} ($inout0,$rndkey1)";
&pxor ($inout3,$rndkey0);
&pxor ($inout4,$rndkey0);
eval"&aes${p} ($inout1,$rndkey1)";
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout5,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key,$rounds));
&add ($rounds,16);
&jmp (&label("_aesni_${p}rypt6_inner"));
&set_label("${p}6_loop",16);
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&set_label("_aesni_${p}rypt6_inner");
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
&set_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
eval"&aes${p} ($inout4,$rndkey0)";
eval"&aes${p} ($inout5,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}6_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
eval"&aes${p}last ($inout4,$rndkey0)";
eval"&aes${p}last ($inout5,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt6");
}
&aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX);
if ($PREFIX eq $AESNI_PREFIX) {
######################################################################
# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
# 0 pshufb mask
# 16 vector addend: 0,6,6,6
# 32 counter-less ivec
# 48 1st triplet of counter vector
# 64 2nd triplet of counter vector
# 80 saved %esp
&function_begin("${PREFIX}_ctr32_encrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&mov ($key_,"esp");
&sub ("esp",88);
&and ("esp",-16); # align stack
&mov (&DWP(80,"esp"),$key_);
&cmp ($len,1);
&je (&label("ctr32_one_shortcut"));
&movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
&mov (&DWP(4,"esp"),0x08090a0b);
&mov (&DWP(8,"esp"),0x04050607);
&mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack
&mov ($rounds,6);
&xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds);
&mov (&DWP(20,"esp"),$rounds);
&mov (&DWP(24,"esp"),$rounds);
&mov (&DWP(28,"esp"),$key_);
&pextrd ($rounds_,$inout5,3); # pull 32-bit counter
&pinsrd ($inout5,$key_,3); # wipe 32-bit counter
&mov ($rounds,&DWP(240,$key)); # key->rounds
# compose 2 vectors of 3x32-bit counters
&bswap ($rounds_);
&pxor ($rndkey0,$rndkey0);
&pxor ($rndkey1,$rndkey1);
&movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
&pinsrd ($rndkey0,$rounds_,0);
&lea ($key_,&DWP(3,$rounds_));
&pinsrd ($rndkey1,$key_,0);
&inc ($rounds_);
&pinsrd ($rndkey0,$rounds_,1);
&inc ($key_);
&pinsrd ($rndkey1,$key_,1);
&inc ($rounds_);
&pinsrd ($rndkey0,$rounds_,2);
&inc ($key_);
&pinsrd ($rndkey1,$key_,2);
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
&movdqu ($inout4,&QWP(0,$key)); # key[0]
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
&pshufb ($rndkey1,$inout0); # byte swap
&pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
&pshufd ($inout1,$rndkey0,2<<6);
&cmp ($len,6);
&jb (&label("ctr32_tail"));
&pxor ($inout5,$inout4); # counter-less ivec^key[0]
&shl ($rounds,4);
&mov ($rounds_,16);
&movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
&mov ($key_,$key); # backup $key
&sub ($rounds_,$rounds); # backup twisted $rounds
&lea ($key,&DWP(32,$key,$rounds));
&sub ($len,6);
&jmp (&label("ctr32_loop6"));
&set_label("ctr32_loop6",16);
# inlining _aesni_encrypt6's prologue gives ~6% improvement...
&pshufd ($inout2,$rndkey0,1<<6);
&movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
&pshufd ($inout3,$rndkey1,3<<6);
&pxor ($inout0,$rndkey0); # merge counter-less ivec
&pshufd ($inout4,$rndkey1,2<<6);
&pxor ($inout1,$rndkey0);
&pshufd ($inout5,$rndkey1,1<<6);
&$movekey ($rndkey1,&QWP(16,$key_));
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&aesenc ($inout0,$rndkey1);
&pxor ($inout4,$rndkey0);
&pxor ($inout5,$rndkey0);
&aesenc ($inout1,$rndkey1);
&$movekey ($rndkey0,&QWP(32,$key_));
&mov ($rounds,$rounds_);
&aesenc ($inout2,$rndkey1);
&aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
&aesenc ($inout5,$rndkey1);
&call (&label("_aesni_encrypt6_enter"));
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movdqa ($rndkey0,&QWP(16,"esp")); # load increment
&xorps ($inout2,$rndkey1);
&movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&paddd ($rndkey1,$rndkey0); # 2nd triplet increment
&paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
&movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
&movups ($inout1,&QWP(0x30,$inp));
&movups ($inout2,&QWP(0x40,$inp));
&xorps ($inout3,$inout1);
&movups ($inout1,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
&xorps ($inout4,$inout2);
&movups (&QWP(0x30,$out),$inout3);
&xorps ($inout5,$inout1);
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
&pshufb ($rndkey1,$inout0); # byte swap
&movups (&QWP(0x40,$out),$inout4);
&pshufd ($inout0,$rndkey0,3<<6);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&pshufd ($inout1,$rndkey0,2<<6);
&sub ($len,6);
&jnc (&label("ctr32_loop6"));
&add ($len,6);
&jz (&label("ctr32_ret"));
&movdqu ($inout5,&QWP(0,$key_));
&mov ($key,$key_);
&pxor ($inout5,&QWP(32,"esp")); # restore counter-less ivec
&mov ($rounds,&DWP(240,$key_)); # restore $rounds
&set_label("ctr32_tail");
&por ($inout0,$inout5);
&cmp ($len,2);
&jb (&label("ctr32_one"));
&pshufd ($inout2,$rndkey0,1<<6);
&por ($inout1,$inout5);
&je (&label("ctr32_two"));
&pshufd ($inout3,$rndkey1,3<<6);
&por ($inout2,$inout5);
&cmp ($len,4);
&jb (&label("ctr32_three"));
&pshufd ($inout4,$rndkey1,2<<6);
&por ($inout3,$inout5);
&je (&label("ctr32_four"));
&por ($inout4,$inout5);
&call ("_aesni_encrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout2,$rndkey1);
&movups ($rndkey1,&QWP(0x40,$inp));
&xorps ($inout3,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout4,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&movups ($in0,&QWP(0,$inp));
&xorps ($in0,$inout0);
&movups (&QWP(0,$out),$in0);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_two",16);
&call ("_aesni_encrypt2");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_three",16);
&call ("_aesni_encrypt3");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&movups ($inout5,&QWP(0x20,$inp));
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$inout5);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_four",16);
&call ("_aesni_encrypt4");
&movups ($inout4,&QWP(0,$inp));
&movups ($inout5,&QWP(0x10,$inp));
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout0,$inout4);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout1,$inout5);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&set_label("ctr32_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
&pxor ("xmm5","xmm5");
&movdqa (&QWP(48,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(64,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&mov ("esp",&DWP(80,"esp"));
&function_end("${PREFIX}_ctr32_encrypt_blocks");
}
######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
# "eax" const unsigned char *userKey
# $rounds int bits
# $key AES_KEY *key
# output:
# "eax" return code
# $rounds rounds
&function_begin_B("_aesni_set_encrypt_key");
&push ("ebp");
&push ("ebx");
&test ("eax","eax");
&jz (&label("bad_pointer"));
&test ($key,$key);
&jz (&label("bad_pointer"));
&call (&label("pic"));
&set_label("pic");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
&picmeup("ebp","GFp_ia32cap_P","ebx",&label("key_const"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
&mov ("ebp",&DWP(4,"ebp"));
&lea ($key,&DWP(16,$key));
&and ("ebp",1<<28|1<<11); # AVX and XOP bits
&cmp ($rounds,256);
&je (&label("14rounds"));
# 192-bit key support was removed.
&cmp ($rounds,128);
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
&cmp ("ebp",1<<28);
&je (&label("10rounds_alt"));
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
&call (&label("key_128_cold"));
&aeskeygenassist("xmm1","xmm0",0x2); # round 2
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x04); # round 3
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x08); # round 4
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x10); # round 5
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x20); # round 6
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x40); # round 7
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x80); # round 8
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x1b); # round 9
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x36); # round 10
&call (&label("key_128"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(80,$key),$rounds);
&jmp (&label("good_key"));
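# key_128/key_128_cold below implement the standard AES-128 expansion
# step: with xmm4's low dword zero on entry, the two shufps/xorps pairs
# turn the previous round key {w0,w1,w2,w3} into the prefix-xor
# {w0, w0^w1, w0^w1^w2, w0^w1^w2^w3}, and the 0xff broadcast of
# aeskeygenassist's result xors SubWord(RotWord(w3))^rcon into every
# lane, leaving the next round key in xmm0.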
&set_label("key_128",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&set_label("key_128_cold");
&shufps ("xmm4","xmm0",0b00010000);
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
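# The "_alt" schedule below avoids aeskeygenassist: pshufb with the
# rotate-n-splat mask from the key_const table broadcasts RotWord(w3)
# to all four dwords, so the ShiftRows step inside aesenclast is a
# no-op and the instruction reduces to SubBytes xor the rcon vector in
# xmm4 (doubled each round with pslld; the 0x1b entry is loaded for the
# final two rounds). The pslldq/pxor chain then supplies the same
# prefix-xor of the previous round key as above.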
&set_label("10rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&mov ($rounds,8);
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&movdqa ("xmm2","xmm0");
&movdqu (&QWP(-16,$key),"xmm0");
&set_label("loop_key128");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&lea ($key,&DWP(16,$key));
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(-16,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&dec ($rounds);
&jnz (&label("loop_key128"));
&movdqa ("xmm4",&QWP(0x30,"ebx"));
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(16,$key),"xmm0");
&mov ($rounds,9);
&mov (&DWP(96,$key),$rounds);
&jmp (&label("good_key"));
# 192-bit key support was removed.
&set_label("14rounds",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
&lea ($key,&DWP(16,$key));
&cmp ("ebp",1<<28);
&je (&label("14rounds_alt"));
&mov ($rounds,13);
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
&call (&label("key_256a_cold"));
&aeskeygenassist("xmm1","xmm0",0x01); # round 3
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x02); # round 4
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x02); # round 5
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x04); # round 6
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x04); # round 7
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x08); # round 8
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x08); # round 9
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x10); # round 10
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x10); # round 11
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x20); # round 12
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x20); # round 13
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x40); # round 14
&call (&label("key_256a"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(16,$key),$rounds);
&xor ("eax","eax");
&jmp (&label("good_key"));
&set_label("key_256a",16);
&$movekey (&QWP(0,$key),"xmm2");
&lea ($key,&DWP(16,$key));
&set_label("key_256a_cold");
&shufps ("xmm4","xmm0",0b00010000);
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
&set_label("key_256b",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&shufps ("xmm4","xmm2",0b00010000);
&xorps ("xmm2","xmm4");
&shufps ("xmm4","xmm2",0b10001100);
&xorps ("xmm2","xmm4");
&shufps ("xmm1","xmm1",0b10101010); # critical path
&xorps ("xmm2","xmm1");
&ret();
&set_label("14rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,7);
&movdqu (&QWP(-32,$key),"xmm0");
&movdqa ("xmm1","xmm2");
&movdqu (&QWP(-16,$key),"xmm2");
&set_label("loop_key256");
&pshufb ("xmm2","xmm5");
&aesenclast ("xmm2","xmm4");
&movdqa ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm0","xmm3");
&pslld ("xmm4",1);
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&dec ($rounds);
&jz (&label("done_key256"));
&pshufd ("xmm2","xmm0",0xff);
&pxor ("xmm3","xmm3");
&aesenclast ("xmm2","xmm3");
&movdqa ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm1","xmm3");
&pxor ("xmm2","xmm1");
&movdqu (&QWP(16,$key),"xmm2");
&lea ($key,&DWP(32,$key));
&movdqa ("xmm1","xmm2");
&jmp (&label("loop_key256"));
&set_label("done_key256");
&mov ($rounds,13);
&mov (&DWP(16,$key),$rounds);
&set_label("good_key");
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&xor ("eax","eax");
&pop ("ebx");
&pop ("ebp");
&ret ();
&set_label("bad_pointer",4);
&mov ("eax",-1);
&pop ("ebx");
&pop ("ebp");
&ret ();
&set_label("bad_keybits",4);
&pxor ("xmm0","xmm0");
&mov ("eax",-2);
&pop ("ebx");
&pop ("ebp");
&ret ();
&function_end_B("_aesni_set_encrypt_key");
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# AES_KEY *key)
&function_begin_B("${PREFIX}_set_encrypt_key");
&mov ("eax",&wparam(0));
&mov ($rounds,&wparam(1));
&mov ($key,&wparam(2));
&call ("_aesni_set_encrypt_key");
&ret ();
&function_end_B("${PREFIX}_set_encrypt_key");
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large

@@ -0,0 +1,630 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It also supports both 32- and 64-bit modes of
# operation. The latter is achieved by limiting the number of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on the mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
# CBC enc CBC dec CTR
# Apple A7 2.39 1.20 1.20
# Cortex-A53 1.32 1.29 1.46
# Cortex-A57(*) 1.95 0.85 0.93
# Denver 1.96 0.86 0.80
# Mongoose 1.33 1.20 1.20
#
# (*) original 3.64/1.34/1.32 results were for r0p0 revision
# and are still same even for updated module;
$flavour = shift;
$output = shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
$prefix="aes_hw";
$code=<<___;
#include <GFp/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with armv8 :-)
.fpu neon
.code 32
#undef __thumb2__
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is to
# maintain both 32- and 64-bit code within a single module and
# transliterate the common code to either flavour with regex voodoo.
#
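# For instance, a source line such as "vld1.32 {q8},[x7],#16" comes out
# as "ld1 {v16.4s},[x7],#16" in the 64-bit flavour and as
# "vld1.32 {q8},[r7]!" in the 32-bit one (illustrative only; the exact
# rules are the substitution loops at the bottom of this file).
#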
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
# On AArch64, put the data in .rodata and use adrp + add for compatibility with
# execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.text
.globl GFp_${prefix}_set_encrypt_key
.type GFp_${prefix}_set_encrypt_key,%function
.align 5
GFp_${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___;
mov $ptr,#-1
cmp $inp,#0
b.eq .Lenc_key_abort
cmp $out,#0
b.eq .Lenc_key_abort
mov $ptr,#-2
cmp $bits,#128
b.lt .Lenc_key_abort
cmp $bits,#256
b.gt .Lenc_key_abort
tst $bits,#0x3f
b.ne .Lenc_key_abort
___
$code.=<<___ if ($flavour =~ /64/);
adrp $ptr,:pg_hi21:.Lrcon
add $ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___ if ($flavour !~ /64/);
adr $ptr,.Lrcon
___
$code.=<<___;
cmp $bits,#192
veor $zero,$zero,$zero
vld1.8 {$in0},[$inp],#16
mov $bits,#8 // reuse $bits
vld1.32 {$rcon,$mask},[$ptr],#32
b.lt .Loop128
// 192-bit key support was removed.
b .L256
.align 4
.Loop128:
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
b.ne .Loop128
vld1.32 {$rcon},[$ptr]
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
veor $in0,$in0,$key
vst1.32 {$in0},[$out]
add $out,$out,#0x50
mov $rounds,#10
b .Ldone
// 192-bit key support was removed.
.align 4
.L256:
vld1.8 {$in1},[$inp]
mov $bits,#7
mov $rounds,#14
vst1.32 {$in0},[$out],#16
.Loop256:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vst1.32 {$in0},[$out],#16
b.eq .Ldone
vdup.32 $key,${in0}[3] // just splat
vext.8 $tmp,$zero,$in1,#12
aese $key,$zero
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
veor $in1,$in1,$key
b .Loop256
.Ldone:
str $rounds,[$out]
mov $ptr,#0
.Lenc_key_abort:
mov x0,$ptr // return value
`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
ret
.size GFp_${prefix}_set_encrypt_key,.-GFp_${prefix}_set_encrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
$code.=<<___;
.globl GFp_${prefix}_${dir}crypt
.type GFp_${prefix}_${dir}crypt,%function
.align 5
GFp_${prefix}_${dir}crypt:
AARCH64_VALID_CALL_TARGET
ldr $rounds,[$key,#240]
vld1.32 {$rndkey0},[$key],#16
vld1.8 {$inout},[$inp]
sub $rounds,$rounds,#2
vld1.32 {$rndkey1},[$key],#16
.Loop_${dir}c:
aes$e $inout,$rndkey0
aes$mc $inout,$inout
vld1.32 {$rndkey0},[$key],#16
subs $rounds,$rounds,#2
aes$e $inout,$rndkey1
aes$mc $inout,$inout
vld1.32 {$rndkey1},[$key],#16
b.gt .Loop_${dir}c
aes$e $inout,$rndkey0
aes$mc $inout,$inout
vld1.32 {$rndkey0},[$key]
aes$e $inout,$rndkey1
veor $inout,$inout,$rndkey0
vst1.8 {$inout},[$out]
ret
.size GFp_${prefix}_${dir}crypt,.-GFp_${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);
### q8-q15 preloaded key schedule
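# Rough shape of the routine below: the 32-bit big-endian counter sits
# in the top lane of the IV vector, .Loop3x_ctr32 keystreams three
# blocks per iteration, and .Lctr32_tail finishes the last one or two
# blocks, using $step (0 or 16) so the loads never run past the input.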
$code.=<<___;
.globl GFp_${prefix}_ctr32_encrypt_blocks
.type GFp_${prefix}_ctr32_encrypt_blocks,%function
.align 5
GFp_${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
___
$code.=<<___;
ldr $rounds,[$key,#240]
ldr $ctr, [$ivp, #12]
vld1.32 {$dat0},[$ivp]
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#4
mov $step,#16
cmp $len,#2
add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
sub $rounds,$rounds,#2
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
cclr $step,lo
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov.32 lines
// could write to $dat1 and $dat2 directly, but that trips these bugs.
// We write to $ivec and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev $ctr, $ctr
#endif
add $tctr1, $ctr, #1
vorr $ivec,$dat0,$dat0
rev $tctr1, $tctr1
vmov.32 ${ivec}[3],$tctr1
add $ctr, $ctr, #2
vorr $dat1,$ivec,$ivec
b.ls .Lctr32_tail
rev $tctr2, $ctr
vmov.32 ${ivec}[3],$tctr2
sub $len,$len,#3 // bias
vorr $dat2,$ivec,$ivec
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Loop3x_ctr32
aese $dat0,q8
aesmc $tmp0,$dat0
aese $dat1,q8
aesmc $tmp1,$dat1
vld1.8 {$in0},[$inp],#16
add $tctr0,$ctr,#1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
rev $tctr0,$tctr0
aese $tmp0,q9
aesmc $tmp0,$tmp0
aese $tmp1,q9
aesmc $tmp1,$tmp1
vld1.8 {$in2},[$inp],#16
mov $key_,$key
aese $dat2,q9
aesmc $tmp2,$dat2
aese $tmp0,q12
aesmc $tmp0,$tmp0
aese $tmp1,q12
aesmc $tmp1,$tmp1
veor $in0,$in0,$rndlast
add $tctr1,$ctr,#2
aese $tmp2,q12
aesmc $tmp2,$tmp2
veor $in1,$in1,$rndlast
add $ctr,$ctr,#3
aese $tmp0,q13
aesmc $tmp0,$tmp0
aese $tmp1,q13
aesmc $tmp1,$tmp1
// Note the logic to update $dat0, $dat1, and $dat2 is written to work
// around the bugs in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
veor $in2,$in2,$rndlast
vmov.32 ${ivec}[3], $tctr0
aese $tmp2,q13
aesmc $tmp2,$tmp2
vorr $dat0,$ivec,$ivec
rev $tctr1,$tctr1
aese $tmp0,q14
aesmc $tmp0,$tmp0
vmov.32 ${ivec}[3], $tctr1
rev $tctr2,$ctr
aese $tmp1,q14
aesmc $tmp1,$tmp1
vorr $dat1,$ivec,$ivec
vmov.32 ${ivec}[3], $tctr2
aese $tmp2,q14
aesmc $tmp2,$tmp2
vorr $dat2,$ivec,$ivec
subs $len,$len,#3
aese $tmp0,q15
aese $tmp1,q15
aese $tmp2,q15
veor $in0,$in0,$tmp0
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
vst1.8 {$in0},[$out],#16
veor $in1,$in1,$tmp1
mov $cnt,$rounds
vst1.8 {$in1},[$out],#16
veor $in2,$in2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$in2},[$out],#16
b.hs .Loop3x_ctr32
adds $len,$len,#3
b.eq .Lctr32_done
cmp $len,#1
mov $step,#16
cclr $step,eq
.Lctr32_tail:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
vld1.32 {q9},[$key_],#16
b.gt .Lctr32_tail
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
vld1.8 {$in0},[$inp],$step
aese $dat0,q12
aesmc $dat0,$dat0
aese $dat1,q12
aesmc $dat1,$dat1
vld1.8 {$in1},[$inp]
aese $dat0,q13
aesmc $dat0,$dat0
aese $dat1,q13
aesmc $dat1,$dat1
veor $in0,$in0,$rndlast
aese $dat0,q14
aesmc $dat0,$dat0
aese $dat1,q14
aesmc $dat1,$dat1
veor $in1,$in1,$rndlast
aese $dat0,q15
aese $dat1,q15
cmp $len,#1
veor $in0,$in0,$dat0
veor $in1,$in1,$dat1
vst1.8 {$in0},[$out],#16
b.eq .Lctr32_done
vst1.8 {$in1},[$out]
.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size GFp_${prefix}_ctr32_encrypt_blocks,.-GFp_${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5),
$mnemonic,$arg;
};
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vext\.8/ext/o or
s/vrev32\.8/rev32/o or
s/vtst\.8/cmtst/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
# fix up remaining legacy suffixes
s/\.[ui]?8//o;
m/\],#8/o and s/\.16b/\.8b/go;
s/\.[ui]?32//o and s/\.16b/\.4s/go;
s/\.[ui]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
print $_,"\n";
}
} else { ######## 32-bit code
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<1) |(($2&8)<<2);
# ARMv7 instructions are always encoded little-endian, so the encoding
# can be emitted as raw bytes. The correct solution is to use the
# .inst directive, but older assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
};
sub unvtbl {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large

@@ -0,0 +1,603 @@
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with a 128-bit key, and the vpaes-x86.pl column lists
# [also large-block CBC] encrypt/decrypt.
#
# aes-586.pl vpaes-x86.pl
#
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
# Nehalem 27.9/40.4/18.1 10.2/11.9
# Atom 70.7/92.1/60.1 61.1/75.4(***)
# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
#
# (*) "Hyper-threading" in this context refers to cache shared among
# multiple cores rather than to Intel HTT specifically. As the vast
# majority of contemporary cores share cache, the slower code path
# is commonplace. In other words, "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) The less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's a respectable +28%/64% improvement on Core 2
# and +15% on Atom (as implied, over "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
&set_label("_vpaes_consts",64);
$k_inv=-0x30; # inv, inva
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
$k_s0F=-0x10; # s0F
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
$k_ipt=0x00; # input transform (lo, hi)
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
$k_sb1=0x20; # sb1u, sb1t
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40; # sb2u, sb2t
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60; # sbou, sbot
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
$k_mc_forward=0x80; # mc_forward
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
$k_mc_backward=0xc0; # mc_backward
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
$k_sr=0x100; # sr
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
$k_rcon=0x140; # rcon
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
$k_s63=0x150; # s63: all equal to 0x63 transformed
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
$k_opt=0x160; # output transform
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&add ($const,&DWP(0,"esp"));
&movdqa ("xmm7",&QWP($k_inv,$const));
&movdqa ("xmm6",&QWP($k_s0F,$const));
&ret ();
&function_end_B("_vpaes_preheat");
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm6-%xmm7 as in _vpaes_preheat
## (%edx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
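## A sketch of the technique (see the Hamburg paper referenced above):
## each state byte is split into its low and high nibble (pand/pandn
## plus psrld by 4), the nibbles index 16-byte pshufb tables ($k_inv,
## $k_sb1, $k_sb2, ...) that evaluate the AES S-box through its
## GF(2^4) tower decomposition, and the $k_mc_forward/$k_mc_backward/
## $k_sr shuffles supply MixColumns and ShiftRows, so the whole round
## is free of secret-dependent memory lookups.
##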
&function_begin_B("_vpaes_encrypt_core");
&mov ($magic,16);
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
&pand ("xmm0","xmm6");
&movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
&pxor ("xmm2","xmm5");
&psrld ("xmm1",4);
&add ($key,16);
&pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
&pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
&pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
&pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
&pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
&pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
&pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
&pshufb ("xmm3","xmm4"); # 3 = D
&add ($magic,16); # next mc
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
&sub ($round,1); # nr--
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&pshufb ("xmm5","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
# middle of last round
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm1");
&ret ();
&function_end_B("_vpaes_encrypt_core");
########################################################
## ##
## AES key schedule ##
## ##
########################################################
&function_begin_B("_vpaes_schedule_core");
&add ($const,&DWP(0,"esp"));
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
# input transform
&movdqa ("xmm3","xmm0");
&lea ($base,&DWP($k_ipt,$const));
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
&call ("_vpaes_schedule_transform");
&movdqa ("xmm7","xmm0");
&test ($out,$out);
&jnz (&label("schedule_am_decrypting"));
# encrypting, output zeroth round key after transform
&movdqu (&QWP(0,$key),"xmm0");
&jmp (&label("schedule_go"));
&set_label("schedule_am_decrypting");
# decrypting, output zeroth round key after shiftrows
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&movdqu (&QWP(0,$key),"xmm3");
&xor ($magic,0x30);
&set_label("schedule_go");
&cmp ($round,192);
&ja (&label("schedule_256"));
# 192-bit key support was removed.
# 128: fall through
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
&set_label("schedule_128");
&mov ($round,10);
&set_label("loop_schedule_128");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # write output
&jmp (&label("loop_schedule_128"));
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
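## (For reference: AES-256 expands two 128-bit halves. The high rounds
## use SubWord(RotWord(.))^rcon exactly as in AES-128, while the
## interleaved low rounds apply only SubWord, which is why the code
## below broadcasts %xmm0 with pshufd 0xFF and swaps %xmm6/%xmm7
## around _vpaes_schedule_low_round.)
##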
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_mangle"); # output low result
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
&movdqa (&QWP(20,"esp"),"xmm7");
&movdqa ("xmm7","xmm6");
&call ("_vpaes_schedule_low_round");
&movdqa ("xmm7",&QWP(20,"esp"));
&jmp (&label("loop_schedule_256"));
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
# schedule last round key from xmm0
&lea ($base,&DWP($k_deskew,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_last_dec"));
# encrypting
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm0","xmm1"); # output permute
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
&add ($key,32);
&set_label("schedule_mangle_last_dec");
&add ($key,-16);
&pxor ("xmm0",&QWP($k_s63,$const));
&call ("_vpaes_schedule_transform"); # output transform
&movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&ret ();
&function_end_B("_vpaes_schedule_core");
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm5.
##
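## In FIPS-197 terms a high round computes
##   w[i]   = w[i-4] ^ SubWord(RotWord(w[i-1])) ^ rcon
##   w[i+k] = w[i+k-4] ^ w[i+k-1]               (k = 1..3)
## and the pslldq/pxor "smear" of %xmm7 is what realizes the chained
## xors, modulo vpaes' change of basis (the $k_s63 constant and the
## input/output transforms).
##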
&function_begin_B("_vpaes_schedule_round");
# extract rcon from xmm8
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
&pxor ("xmm1","xmm1");
&palignr("xmm1","xmm2",15);
&palignr("xmm2","xmm2",15);
&pxor ("xmm7","xmm1");
# rotate
&pshufd ("xmm0","xmm0",0xFF);
&palignr("xmm0","xmm0",1);
# fall through...
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
# smear xmm7
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",4);
&pxor ("xmm7","xmm1");
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",8);
&pxor ("xmm7","xmm1");
&pxor ("xmm7",&QWP($k_s63,$const));
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm5"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm5"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = sbox output
# add in smeared stuff
&pxor ("xmm0","xmm7");
&movdqa ("xmm7","xmm0");
&ret ();
&function_end_B("_vpaes_schedule_round");
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%ebx)
##
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
&movdqa ("xmm2",&QWP($k_s0F,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4);
&pand ("xmm0","xmm2");
&movdqa ("xmm2",&QWP(0,$base));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP(16,$base));
&pshufb ("xmm0","xmm1");
&pxor ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_schedule_transform");
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%edx), and increments or decrements it
## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
## Clobbers xmm1-xmm5
##
&function_begin_B("_vpaes_schedule_mangle");
&movdqa ("xmm4","xmm0"); # save xmm0 for later
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_dec"));
# encrypting
&add ($key,16);
&pxor ("xmm4",&QWP($k_s63,$const));
&pshufb ("xmm4","xmm5");
&movdqa ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&jmp (&label("schedule_mangle_both"));
&set_label("schedule_mangle_dec",16);
# inverse mix columns
&movdqa ("xmm2",&QWP($k_s0F,$const));
&lea ($inp,&DWP($k_dksd,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm4");
&psrld ("xmm1",4); # 1 = hi
&pand ("xmm4","xmm2"); # 4 = lo
&movdqa ("xmm2",&QWP(0,$inp));
&pshufb ("xmm2","xmm4");
&movdqa ("xmm3",&QWP(0x10,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x20,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x30,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x40,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x50,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x60,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x70,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&add ($key,-16);
&set_label("schedule_mangle_both");
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&add ($magic,-16);
&and ($magic,0x30);
&movdqu (&QWP(0,$key),"xmm3");
&ret ();
&function_end_B("_vpaes_schedule_mangle");
#
# Interface to OpenSSL
#
&function_begin("GFp_${PREFIX}_set_encrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&mov ($magic,0x30);
&mov ($out,0);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("GFp_${PREFIX}_set_encrypt_key");
&function_begin("GFp_${PREFIX}_encrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_encrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("GFp_${PREFIX}_encrypt");
&asm_finish();
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large

@@ -0,0 +1,761 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; the ABI and instruction syntax are identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for the integer-only code. The code is chosen
# for execution on all NEON-capable processors because the gain on
# the others outweighs the marginal loss on Cortex-A9.
# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
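# The integer path below is the usual word-serial Montgomery loop: for
# each word bp[i] it accumulates ap[]*bp[i] (umull/umlal), derives
# m = tp[0]*n0 mod 2^32, adds m*np[] so the lowest word cancels and the
# running sum shifts down by one word, and .Lsub/.Lcopy finish with the
# conditional subtraction of the modulus.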
$code=<<___;
#include <GFp/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
#if __ARM_MAX_ARCH__>=7
.extern GFp_armcap_P
.hidden GFp_armcap_P
.align 5
.LOPENSSL_armcap:
.word GFp_armcap_P-.Lbn_mul_mont
#endif
.global GFp_bn_mul_mont
.type GFp_bn_mul_mont,%function
.align 5
GFp_bn_mul_mont:
.Lbn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,.Lbn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
#ifdef __APPLE__
ldr r0,[r0]
#endif
tst r0,#ARMV7_NEON @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov $num,ip @ load num
#ifdef __thumb2__
ittt lt
#endif
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
mov $tj,sp
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,$tj @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
#ifdef __thumb2__
itt ne
#endif
movne $tj,sp
bne .Louter
ldr $rp,[$_rp] @ pull rp
mov $aj,sp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,$aj @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
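@ tp[] now holds the Montgomery product, which may still be >= np[]:
@ subtract the modulus once into rp[], then let .Lcopy pick tp[] or
@ tp[]-np[] from the final borrow (movcc) while zapping tp[] as it goes.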
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
.Lcopy: ldr $tj,[$tp] @ conditional copy
ldr $aj,[$rp]
str sp,[$tp],#4 @ zap tp
#ifdef __thumb2__
it cc
#endif
movcc $aj,$tj
str $aj,[$rp],#4
teq $tp,$num @ preserve carry
bne .Lcopy
mov sp,$num
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size GFp_bn_mul_mont,.-GFp_bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
mov ip,sp
cmp $num,#8
bhi .LNEON_8n
@ special case for $num==8, everything is in register bank...
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
sub $toutptr,sp,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
vzip.16 $Bi,$zero
vmull.u32 @ACC[0],$Bi,${A0}[0]
vmull.u32 @ACC[1],$Bi,${A0}[1]
vmull.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmull.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
vmul.u32 $Ni,$Ni,$M0
vmull.u32 @ACC[4],$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 @ACC[5],$Bi,${A2}[1]
vmull.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
mov $toutptr,sp
vshr.u64 $temp,@ACC[0]#lo,#16
mov $inner,$num
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
add $tinptr,sp,#96
vshr.u64 $temp,@ACC[0]#hi,#16
vzip.16 @ACC[0]#lo,@ACC[0]#hi
b .LNEON_tail_entry
.align 4
.LNEON_8n:
veor @ACC[0],@ACC[0],@ACC[0]
sub $toutptr,sp,#128
veor @ACC[1],@ACC[1],@ACC[1]
sub $toutptr,$toutptr,$num,lsl#4
veor @ACC[2],@ACC[2],@ACC[2]
and $toutptr,$toutptr,#-64
veor @ACC[3],@ACC[3],@ACC[3]
mov sp,$toutptr @ alloca
veor @ACC[4],@ACC[4],@ACC[4]
add $toutptr,$toutptr,#256
veor @ACC[5],@ACC[5],@ACC[5]
sub $inner,$num,#8
veor @ACC[6],@ACC[6],@ACC[6]
veor @ACC[7],@ACC[7],@ACC[7]
.LNEON_8n_init:
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
subs $inner,$inner,#8
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
bne .LNEON_8n_init
add $tinptr,sp,#256
vld1.32 {$A0-$A3},[$aptr]!
add $bnptr,sp,#8
vld1.32 {${M0}[0]},[$n0,:32]
mov $outer,$num
b .LNEON_8n_outer
.align 4
.LNEON_8n_outer:
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
add $toutptr,sp,#128
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
vmlal.u32 @ACC[0],$Ni,${N0}[0]
veor $temp,$temp,$temp
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vzip.16 $Bi,$temp
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]!
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
add $bnptr,sp,#8 @ rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
sub $inner,$num,#8
b .LNEON_8n_inner
.align 4
.LNEON_8n_inner:
subs $inner,$inner,#8
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[3],$Bi,${A1}[1]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vst1.64 {@ACC[0]},[$toutptr,:128]!
___
push(@ACC,shift(@ACC));
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
it eq
subeq $aptr,$aptr,$num,lsl#2 @ rewind
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[2],$Ni,${N1}[0]
add $bnptr,sp,#8 @ rewind
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vst1.64 {@ACC[0]},[$toutptr,:128]!
vmlal.u32 @ACC[7],$Ni,${N3}[1]
bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
add $tinptr,sp,#128
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
veor q2,q2,q2 @ $N0-$N1
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
veor q3,q3,q3 @ $N2-$N3
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]},[$toutptr,:128]
subs $outer,$outer,#8
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
itt ne
subne $nptr,$nptr,$num,lsl#2 @ rewind
bne .LNEON_8n_outer
add $toutptr,sp,#128
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
vshr.u64 $temp,@ACC[0]#lo,#16
vst1.64 {q2-q3},[sp,:256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vst1.64 {q2-q3}, [sp,:256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vst1.64 {q2-q3}, [sp,:256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
mov $inner,$num
b .LNEON_tail_entry
.align 4
.LNEON_tail:
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vshr.u64 $temp,@ACC[0]#lo,#16
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
vshr.u64 $temp,@ACC[1]#lo,#16
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
vshr.u64 $temp,@ACC[1]#hi,#16
vzip.16 @ACC[1]#lo,@ACC[1]#hi
___
push(@ACC,shift(@ACC));
}
push(@ACC,shift(@ACC));
$code.=<<___;
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
subs $inner,$inner,#8
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
mov r11,sp
veor q0,q0,q0
sub r11,$bptr,r11 @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
mov sp,ip
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
s/\bret\b/bx lr/g or
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
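# (note the "or" chain applies at most one substitution per line: NEON
# q-register halves are renamed first, a "ret" becomes "bx lr", and only a
# line that already contained a literal "bx lr" is rewritten as the raw
# opcode word, which is what lets the output assemble with -march=armv4)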
print $_,"\n";
}
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large

View File

@@ -0,0 +1,336 @@
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# This is "teaser" code, as it can be improved in several ways...
# First of all, a non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4; others fall back to the original code). Then the inner
# loop can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to the 128-bit XMM register bank (though that would require input
# rearrangement and/or increase bus bandwidth utilization). A dedicated
# squaring procedure should give a further performance improvement...
# Yet, even as a draft, the code improves the rsa512 *sign* benchmark by
# 110%(!), rsa1024 by 70% and rsa4096 by 20% :-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0]);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("GFp_ia32cap_P") if ($sse2);
&function_begin("GFp_bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arranging a 2K window between the stack
# pointer and the ap argument [np is also a position-sensitive vector,
# but it's assumed to be near ap, as it's allocated at about the same
# time].
&mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("ebp","edx"); # this splits them apart modulo 4096
&and ("ebp",-64); # align to cache line
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on the stack being "wired" to
# physical memory in a strictly sequential manner, i.e. if a stack
# allocation spans two pages, then a reference to the farthest one can
# be punished with SEGV. But page walking can do good even on
# other OSes, because it guarantees that a villain thread hits
# the guard page before it can do damage to an innocent one...
&mov ("eax","esp");
&sub ("eax","ebp");
&and ("eax",-4096);
&mov ("edx","esp"); # saved stack pointer!
&lea ("esp",&DWP(0,"ebp","eax"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&jmp (&label("page_walk_done"));
&set_label("page_walk",16);
&lea ("esp",&DWP(-4096,"esp"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","GFp_ia32cap_P");
&bt (&DWP(0,"eax"),26);
# The non-SSE2 code was removed.
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
} # The non-SSE2 code was removed.
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&mov ("edx",-1);
&xor ("edx","eax");
&jmp (&label("copy"));
&set_label("copy",16); # conditional copy
&mov ($tp,&DWP($frame,"esp",$num,4));
&mov ($np,&DWP(0,$rp,$num,4));
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&and ($tp,"eax");
&and ($np,"edx");
&or ($np,$tp);
&mov (&DWP(0,$rp,$num,4),$np);
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&function_end("GFp_bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,197 @@
/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/* ====================================================================
* Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
*
* Portions of the attached software ("Contribution") are developed by
* SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
*
* The Contribution is licensed pursuant to the Eric Young open source
* license provided above.
*
* The binary polynomial arithmetic software is originally written by
* Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
* Laboratories. */
#ifndef OPENSSL_HEADER_BN_INTERNAL_H
#define OPENSSL_HEADER_BN_INTERNAL_H
#include <GFp/base.h>
#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push, 3)
#include <intrin.h>
#pragma warning(pop)
#pragma intrinsic(_umul128)
#endif
#include "../../internal.h"
typedef crypto_word BN_ULONG;
#if defined(OPENSSL_64_BIT)
#if defined(BORINGSSL_HAS_UINT128)
// MSVC doesn't support two-word integers on 64-bit.
#define BN_ULLONG uint128_t
#endif
#define BN_BITS2 64
#define BN_MONT_CTX_N0_LIMBS 1
#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo), 0
#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
#elif defined(OPENSSL_32_BIT)
#define BN_ULLONG uint64_t
#define BN_BITS2 32
// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
// needs to be two words long. Only certain 32-bit platforms actually make use
// of n0[1], and a shorter R value would suffice for the others. However,
// currently only the assembly files know which is which.
#define BN_MONT_CTX_N0_LIMBS 2
#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo)
#define TOBN(hi, lo) (lo), (hi)
#else
#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
#endif
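// For illustration, BN_MONT_CTX_N0(0xAB, 0xCD) expands to
//   ((BN_ULONG)(0xAB) << 32 | (0xCD)), 0   i.e. 0x000000AB000000CD plus a zero pad, on 64-bit, and
//   (0xCD), (0xAB)                         i.e. two little-endian 32-bit limbs, on 32-bit,
// so either way the (hi, lo) pair denotes the 64-bit value (hi << 32) | lo.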
// |num| must be at least 4, at least on x86.
//
// In other forks, |bn_mul_mont| returns an |int| indicating whether it
// actually did the multiplication. All our implementations always do the
// multiplication, and forcing callers to deal with the possibility of it
// failing just leads to further problems.
//
// In other forks, |bn_mod_mul|'s `num` argument has type |int| but it is
// implicitly treated as a |size_t|; when |int| is smaller than |size_t|
// then the |movq 48(%rsp),%r9| done by x86_64-xlate.pl implicitly does the
// conversion.
OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) ||
(sizeof(int) == 4 && sizeof(size_t) == 8),
"int and size_t ABI mismatch");
void GFp_bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out,
BN_ULONG a, BN_ULONG b) {
#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
*low_out = _umul128(a, b, high_out);
#else
BN_ULLONG result = (BN_ULLONG)a * b;
*low_out = (BN_ULONG)result;
*high_out = (BN_ULONG)(result >> BN_BITS2);
#endif
}
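// For example, on a 64-bit target bn_umult_lohi(&lo, &hi, UINT64_MAX, 2)
// leaves hi == 1 and lo == 0xfffffffffffffffe, the two halves of the
// double-width product (2^64 - 1) * 2 = 2^65 - 2.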
#endif // OPENSSL_HEADER_BN_INTERNAL_H

View File

@@ -0,0 +1,158 @@
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com). */
#include "internal.h"
#include "../../internal.h"
#include "../../limbs/limbs.h"
#include "../../limbs/limbs.inl"
OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
"BN_MONT_CTX_N0_LIMBS value is invalid");
OPENSSL_STATIC_ASSERT(
sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t),
"uint64_t is insufficient precision for n0");
int GFp_bn_from_montgomery_in_place(BN_ULONG r[], size_t num_r, BN_ULONG a[],
size_t num_a, const BN_ULONG n[],
size_t num_n,
const BN_ULONG n0_[BN_MONT_CTX_N0_LIMBS]) {
if (num_n == 0 || num_r != num_n || num_a != 2 * num_n) {
return 0;
}
// Add multiples of |n| to |r| until R = 2^(num_n * BN_BITS2) divides it. On
// input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r|
// includes |carry| which is stored separately.
BN_ULONG n0 = n0_[0];
BN_ULONG carry = 0;
for (size_t i = 0; i < num_n; i++) {
BN_ULONG v = GFp_limbs_mul_add_limb(a + i, n, a[i] * n0, num_n);
v += carry + a[i + num_n];
carry |= (v != a[i + num_n]);
carry &= (v <= a[i + num_n]);
a[i + num_n] = v;
}
// Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a|
// includes |carry| which is stored separately.
a += num_n;
// |a| thus requires at most one additional subtraction |n| to be reduced.
// Subtract |n| and select the answer in constant time.
BN_ULONG v = limbs_sub(r, a, n, num_n) - carry;
// |v| is one if |a| - |n| underflowed or zero if it did not. Note |v| cannot
// be -1. That would imply the subtraction did not fit in |num_n| words, and
// we know at most one subtraction is needed.
v = 0u - v;
for (size_t i = 0; i < num_n; i++) {
r[i] = constant_time_select_w(v, a[i], r[i]);
a[i] = 0;
}
return 1;
}
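// Toy illustration of the loop above with made-up one-limb parameters (the
// real code uses full BN_BITS2-wide limbs): take n = 7, R = 16,
// n0 = -n^-1 mod 16 = 9, and a = 30 < n*R. Then m = a*n0 mod 16 = 14,
// a + m*n = 30 + 98 = 128 is divisible by R, 128/R = 8, and one final
// conditional subtraction of n gives 1, which is indeed 30 * 16^-1 mod 7.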

View File

@@ -0,0 +1,105 @@
/* Copyright 2016 Brian Smith.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "internal.h"
#include "../../internal.h"
OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
"BN_MONT_CTX_N0_LIMBS value is invalid");
OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t),
"uint64_t is insufficient precision for n0");
// LG_LITTLE_R is log_2(r).
#define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2)
// GFp_bn_neg_inv_mod_r_u64 calculates -1/n mod r; i.e. it calculates |v|
// such that u*r - v*n == 1, where r = 2^LG_LITTLE_R as defined above. |n|
// must be odd.
//
// This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery
// Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf).
// It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and
// Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000"
// (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21).
//
// This is inspired by Joppe W. Bos's "Constant Time Modular Inversion"
// (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is
// constant-time with respect to |n|. We assume uint64_t additions,
// subtractions, shifts, and bitwise operations are all constant time, which
// may be a large leap of faith on 32-bit targets. We avoid division and
// multiplication, which tend to be the most problematic in terms of timing
// leaks.
//
// Most GCD implementations return values such that |u*r + v*n == 1|, so the
// caller would have to negate the resultant |v| for the purpose of Montgomery
// multiplication. This implementation does the negation implicitly by doing
// the computations as a difference instead of a sum.
uint64_t GFp_bn_neg_inv_mod_r_u64(uint64_t n) {
dev_assert_secret(n % 2 == 1);
// alpha == 2**(lg r - 1) == r / 2.
static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1);
const uint64_t beta = n;
uint64_t u = 1;
uint64_t v = 0;
// The invariant maintained from here on is:
// 2**(lg r - i) == u*2*alpha - v*beta.
for (size_t i = 0; i < LG_LITTLE_R; ++i) {
#if BN_BITS2 == 64 && defined(BN_ULLONG)
dev_assert_secret((BN_ULLONG)(1) << (LG_LITTLE_R - i) ==
((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif
// Delete a common factor of 2 in u and v if |u| is even. Otherwise, set
// |u = (u + beta) / 2| and |v = (v / 2) + alpha|.
uint64_t u_is_odd = UINT64_C(0) - (u & 1); // Either 0xff..ff or 0.
// The addition can overflow, so use Dietz's method for it.
//
// Dietz calculates (x+y)/2 by (x xor y)>>1 + x&y. This is valid for all
// (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values
// (embedded in 64 bits so that overflow can be ignored):
//
// (declare-fun x () (_ BitVec 64))
// (declare-fun y () (_ BitVec 64))
// (assert (let (
// (one (_ bv1 64))
// (thirtyTwo (_ bv32 64)))
// (and
// (bvult x (bvshl one thirtyTwo))
// (bvult y (bvshl one thirtyTwo))
// (not (=
// (bvadd (bvlshr (bvxor x y) one) (bvand x y))
// (bvlshr (bvadd x y) one)))
// )))
// (check-sat)
uint64_t beta_if_u_is_odd = beta & u_is_odd; // Either |beta| or 0.
u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd);
uint64_t alpha_if_u_is_odd = alpha & u_is_odd; /* Either |alpha| or 0. */
v = (v >> 1) + alpha_if_u_is_odd;
}
// The invariant now shows that u*r - v*n == 1 since r == 2 * alpha.
#if BN_BITS2 == 64 && defined(BN_ULLONG)
dev_assert_secret(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif
return v;
}
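// Toy check of the contract above with made-up parameters (the real r is
// 2^64): for r = 16 and n = 7, u = 4 and v = 9 satisfy u*r - v*n ==
// 64 - 63 == 1, and 9 is exactly -7^-1 mod 16, i.e. the n0 value a
// Montgomery implementation with such a small r would use.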

View File

@@ -0,0 +1,901 @@
#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
# with/without -DECP_NISTZ256_ASM
# Cortex-A8 +53-170%
# Cortex-A9 +76-205%
# Cortex-A15 +100-316%
# Snapdragon S4 +66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$code.=<<___;
#include <GFp/arm_arch.h>
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.asciz "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 6
___
########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...
($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);
$code.=<<___;
.type __ecp_nistz256_mul_by_2,%function
.align 4
__ecp_nistz256_mul_by_2:
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself
ldr $a3,[$a_ptr,#12]
adcs $a1,$a1,$a1
ldr $a4,[$a_ptr,#16]
adcs $a2,$a2,$a2
ldr $a5,[$a_ptr,#20]
adcs $a3,$a3,$a3
ldr $a6,[$a_ptr,#24]
adcs $a4,$a4,$a4
ldr $a7,[$a_ptr,#28]
adcs $a5,$a5,$a5
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
adc $ff,$ff,#0
b .Lreduce_by_sub
.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
@ void GFp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@ const BN_ULONG r2[8]);
.globl GFp_nistz256_add
.type GFp_nistz256_add,%function
.align 4
GFp_nistz256_add:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size GFp_nistz256_add,.-GFp_nistz256_add
.type __ecp_nistz256_add,%function
.align 4
__ecp_nistz256_add:
str lr,[sp,#-4]! @ push lr
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
ldr $a3,[$a_ptr,#12]
ldr $a4,[$a_ptr,#16]
ldr $t0,[$b_ptr,#0]
ldr $a5,[$a_ptr,#20]
ldr $t1,[$b_ptr,#4]
ldr $a6,[$a_ptr,#24]
ldr $t2,[$b_ptr,#8]
ldr $a7,[$a_ptr,#28]
ldr $t3,[$b_ptr,#12]
adds $a0,$a0,$t0
ldr $t0,[$b_ptr,#16]
adcs $a1,$a1,$t1
ldr $t1,[$b_ptr,#20]
adcs $a2,$a2,$t2
ldr $t2,[$b_ptr,#24]
adcs $a3,$a3,$t3
ldr $t3,[$b_ptr,#28]
adcs $a4,$a4,$t0
adcs $a5,$a5,$t1
adcs $a6,$a6,$t2
mov $ff,#0
adcs $a7,$a7,$t3
adc $ff,$ff,#0
ldr lr,[sp],#4 @ pop lr
.Lreduce_by_sub:
@ if a+b >= modulus, subtract modulus.
@
@ But since comparison implies subtraction, we subtract
@ modulus and then add it back if subtraction borrowed.
subs $a0,$a0,#-1
sbcs $a1,$a1,#-1
sbcs $a2,$a2,#-1
sbcs $a3,$a3,#0
sbcs $a4,$a4,#0
sbcs $a5,$a5,#0
sbcs $a6,$a6,#1
sbcs $a7,$a7,#-1
sbc $ff,$ff,#0
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ using value of borrow as a whole or extracting single bit.
@ Follow $ff register...
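@ For reference, p-256 = 2^256-2^224+2^192+2^96-1 has little-endian
@ 32-bit limbs ffffffff ffffffff ffffffff 00000000 00000000 00000000
@ 00000001 ffffffff, so with $ff holding the borrow mask (0 or
@ 0xffffffff) the sequence below adds back exactly
@ $ff,$ff,$ff,0,0,0,$ff>>31,$ff.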
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_add,.-__ecp_nistz256_add
.type __ecp_nistz256_mul_by_3,%function
.align 4
__ecp_nistz256_mul_by_3:
str lr,[sp,#-4]! @ push lr
@ As multiplication by 3 is performed as 2*n+n, below are inline
@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
@ corresponding subroutines for details.
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
ldr $a3,[$a_ptr,#12]
adcs $a1,$a1,$a1
ldr $a4,[$a_ptr,#16]
adcs $a2,$a2,$a2
ldr $a5,[$a_ptr,#20]
adcs $a3,$a3,$a3
ldr $a6,[$a_ptr,#24]
adcs $a4,$a4,$a4
ldr $a7,[$a_ptr,#28]
adcs $a5,$a5,$a5
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
adc $ff,$ff,#0
subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores
sbcs $a1,$a1,#-1
sbcs $a2,$a2,#-1
sbcs $a3,$a3,#0
sbcs $a4,$a4,#0
sbcs $a5,$a5,#0
sbcs $a6,$a6,#1
sbcs $a7,$a7,#-1
sbc $ff,$ff,#0
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
adcs $a2,$a2,$ff
adcs $a3,$a3,#0
adcs $a4,$a4,#0
ldr $b_ptr,[$a_ptr,#0]
adcs $a5,$a5,#0
ldr $t1,[$a_ptr,#4]
adcs $a6,$a6,$ff,lsr#31
ldr $t2,[$a_ptr,#8]
adc $a7,$a7,$ff
ldr $t0,[$a_ptr,#12]
adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
ldr $b_ptr,[$a_ptr,#16]
adcs $a1,$a1,$t1
ldr $t1,[$a_ptr,#20]
adcs $a2,$a2,$t2
ldr $t2,[$a_ptr,#24]
adcs $a3,$a3,$t0
ldr $t3,[$a_ptr,#28]
adcs $a4,$a4,$b_ptr
adcs $a5,$a5,$t1
adcs $a6,$a6,$t2
mov $ff,#0
adcs $a7,$a7,$t3
adc $ff,$ff,#0
ldr lr,[sp],#4 @ pop lr
b .Lreduce_by_sub
.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
.type __ecp_nistz256_div_by_2,%function
.align 4
__ecp_nistz256_div_by_2:
@ ret = (a is odd ? a+mod : a) >> 1
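@ (the modulus is odd, so a+mod is even exactly when a is odd and the
@ halved value equals a*2^-1 mod p; the possible 257-th bit produced
@ by the addition is kept in $b_ptr and shifted back into the top
@ word at the end)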
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
mov $ff,$a0,lsl#31 @ place the least significant bit in the most
@ significant position; an arithmetic
@ right shift by 31 then produces -1 or
@ 0, while a logical right shift gives 1 or 0;
@ this is how the modulus is conditionally
@ synthesized in this case...
ldr $a3,[$a_ptr,#12]
adds $a0,$a0,$ff,asr#31
ldr $a4,[$a_ptr,#16]
adcs $a1,$a1,$ff,asr#31
ldr $a5,[$a_ptr,#20]
adcs $a2,$a2,$ff,asr#31
ldr $a6,[$a_ptr,#24]
adcs $a3,$a3,#0
ldr $a7,[$a_ptr,#28]
adcs $a4,$a4,#0
mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early
@ because it doesn't affect flags
adcs $a5,$a5,#0
orr $a0,$a0,$a1,lsl#31
adcs $a6,$a6,$ff,lsr#31
mov $b_ptr,#0
adcs $a7,$a7,$ff,asr#31
mov $a1,$a1,lsr#1
adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition
orr $a1,$a1,$a2,lsl#31
mov $a2,$a2,lsr#1
str $a0,[$r_ptr,#0]
orr $a2,$a2,$a3,lsl#31
mov $a3,$a3,lsr#1
str $a1,[$r_ptr,#4]
orr $a3,$a3,$a4,lsl#31
mov $a4,$a4,lsr#1
str $a2,[$r_ptr,#8]
orr $a4,$a4,$a5,lsl#31
mov $a5,$a5,lsr#1
str $a3,[$r_ptr,#12]
orr $a5,$a5,$a6,lsl#31
mov $a6,$a6,lsr#1
str $a4,[$r_ptr,#16]
orr $a6,$a6,$a7,lsl#31
mov $a7,$a7,lsr#1
str $a5,[$r_ptr,#20]
orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
.type __ecp_nistz256_sub,%function
.align 4
__ecp_nistz256_sub:
str lr,[sp,#-4]! @ push lr
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
ldr $a3,[$a_ptr,#12]
ldr $a4,[$a_ptr,#16]
ldr $t0,[$b_ptr,#0]
ldr $a5,[$a_ptr,#20]
ldr $t1,[$b_ptr,#4]
ldr $a6,[$a_ptr,#24]
ldr $t2,[$b_ptr,#8]
ldr $a7,[$a_ptr,#28]
ldr $t3,[$b_ptr,#12]
subs $a0,$a0,$t0
ldr $t0,[$b_ptr,#16]
sbcs $a1,$a1,$t1
ldr $t1,[$b_ptr,#20]
sbcs $a2,$a2,$t2
ldr $t2,[$b_ptr,#24]
sbcs $a3,$a3,$t3
ldr $t3,[$b_ptr,#28]
sbcs $a4,$a4,$t0
sbcs $a5,$a5,$t1
sbcs $a6,$a6,$t2
sbcs $a7,$a7,$t3
sbc $ff,$ff,$ff @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
.Lreduce_by_add:
@ if a-b borrows, add modulus.
@
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ broadcasting borrow bit to a register, $ff, and using it as
@ a whole or extracting single bit.
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_sub,.-__ecp_nistz256_sub
@ void GFp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl GFp_nistz256_neg
.type GFp_nistz256_neg,%function
.align 4
GFp_nistz256_neg:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size GFp_nistz256_neg,.-GFp_nistz256_neg
.type __ecp_nistz256_neg,%function
.align 4
__ecp_nistz256_neg:
ldr $a0,[$a_ptr,#0]
eor $ff,$ff,$ff
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
subs $a0,$ff,$a0
ldr $a3,[$a_ptr,#12]
sbcs $a1,$ff,$a1
ldr $a4,[$a_ptr,#16]
sbcs $a2,$ff,$a2
ldr $a5,[$a_ptr,#20]
sbcs $a3,$ff,$a3
ldr $a6,[$a_ptr,#24]
sbcs $a4,$ff,$a4
ldr $a7,[$a_ptr,#28]
sbcs $a5,$ff,$a5
sbcs $a6,$ff,$a6
sbcs $a7,$ff,$a7
sbc $ff,$ff,$ff
b .Lreduce_by_add
.size __ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
$code.=<<___;
@ void GFp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@ const BN_ULONG r2[8]);
.globl GFp_nistz256_mul_mont
.type GFp_nistz256_mul_mont,%function
.align 4
GFp_nistz256_mul_mont:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont
.type __ecp_nistz256_mul_mont,%function
.align 4
__ecp_nistz256_mul_mont:
stmdb sp!,{r0-r2,lr} @ make a copy of arguments too
ldr $bj,[$b_ptr,#0] @ b[0]
ldmia $a_ptr,{@acc[1]-@acc[8]}
umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0]
stmdb sp!,{@acc[1]-@acc[8]} @ copy a[0-7] to stack, so
@ that it can be addressed
@ without spending register
@ on address
umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0]
umull @acc[2],$t1,@acc[3],$bj
adds @acc[1],@acc[1],$t3 @ accumulate high part of mult
umull @acc[3],$t2,@acc[4],$bj
adcs @acc[2],@acc[2],$t0
umull @acc[4],$t3,@acc[5],$bj
adcs @acc[3],@acc[3],$t1
umull @acc[5],$t0,@acc[6],$bj
adcs @acc[4],@acc[4],$t2
umull @acc[6],$t1,@acc[7],$bj
adcs @acc[5],@acc[5],$t3
umull @acc[7],$t2,@acc[8],$bj
adcs @acc[6],@acc[6],$t0
adcs @acc[7],@acc[7],$t1
eor $t3,$t3,$t3 @ first overflow bit is zero
adc @acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];
# Reduction iteration is normally performed by accumulating
# result of multiplication of modulus by "magic" digit [and
# omitting least significant word, which is guaranteed to
# be 0], but thanks to special form of modulus and "magic"
# digit being equal to least significant word, it can be
# performed with additions and subtractions alone. Indeed:
#
# ffff.0001.0000.0000.0000.ffff.ffff.ffff
# * abcd
# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
#
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
# rewrite above as:
#
# xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
# - abcd.0000.0000.0000.0000.0000.0000.abcd
#
# or marking redundant operations:
#
# xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
# + abcd.0000.abcd.0000.0000.abcd.----.----.----
# - abcd.----.----.----.----.----.----.----
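# A concrete instance of the identity used above, with 32-bit words:
# 0xffffffff * abcd = abcd*2^32 - abcd, i.e. one add of the magic digit
# one word up and one subtract of it in place; a run of 0xffffffff words
# telescopes to a single add above the run and a single subtract below
# it, which is exactly the pattern of adds/subs emitted below.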
$code.=<<___;
@ multiplication-less reduction $i
adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0]
ldr $bj,[sp,#40] @ restore b_ptr
adcs @acc[4],@acc[4],#0 @ r[4]+=0
adcs @acc[5],@acc[5],#0 @ r[5]+=0
adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0]
ldr $t1,[sp,#0] @ load a[0]
adcs @acc[7],@acc[7],#0 @ r[7]+=0
ldr $bj,[$bj,#4*$i] @ load b[i]
adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0]
eor $t0,$t0,$t0
adc $t3,$t3,#0 @ overflow bit
subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0]
ldr $t2,[sp,#4] @ a[1]
sbcs @acc[8],@acc[8],#0 @ r[8]-=0
umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i]
eor $t1,$t1,$t1
sbc @acc[0],$t3,#0 @ overflow bit, keep in mind
@ that the net result is
@ the addition of a value which
@ makes underflow impossible
ldr $t3,[sp,#8] @ a[2]
umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i]
str @acc[0],[sp,#36] @ temporarily offload overflow
eor $t2,$t2,$t2
ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0]
umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i]
eor $t3,$t3,$t3
adds @acc[2],@acc[2],$t0 @ accumulate high part of mult
ldr $t0,[sp,#16] @ a[4]
umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i]
eor $t4,$t4,$t4
adcs @acc[3],@acc[3],$t1
ldr $t1,[sp,#20] @ a[5]
umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i]
eor $t0,$t0,$t0
adcs @acc[4],@acc[4],$t2
ldr $t2,[sp,#24] @ a[6]
umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i]
eor $t1,$t1,$t1
adcs @acc[5],@acc[5],$t3
ldr $t3,[sp,#28] @ a[7]
umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i]
eor $t2,$t2,$t2
adcs @acc[6],@acc[6],$t4
ldr @acc[0],[sp,#36] @ restore overflow bit
umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i]
eor $t3,$t3,$t3
adcs @acc[7],@acc[7],$t0
adcs @acc[8],@acc[8],$t1
adcs @acc[0],@acc[0],$t2
adc $t3,$t3,#0 @ new overflow bit
___
push(@acc,shift(@acc)); # rotate registers, so that
# "r[i]" becomes r[i]
}
$code.=<<___;
@ last multiplication-less reduction
adds @acc[3],@acc[3],@acc[0]
ldr $r_ptr,[sp,#32] @ restore r_ptr
adcs @acc[4],@acc[4],#0
adcs @acc[5],@acc[5],#0
adcs @acc[6],@acc[6],@acc[0]
adcs @acc[7],@acc[7],#0
adcs @acc[8],@acc[8],@acc[0]
adc $t3,$t3,#0
subs @acc[7],@acc[7],@acc[0]
sbcs @acc[8],@acc[8],#0
sbc @acc[0],$t3,#0 @ overflow bit
@ Final step is "if result > mod, subtract mod", but we do it
@ "other way around", namely subtract modulus from result
@ and if it borrowed, add modulus back.
adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1
adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1
adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1
sbcs @acc[4],@acc[4],#0
sbcs @acc[5],@acc[5],#0
sbcs @acc[6],@acc[6],#0
sbcs @acc[7],@acc[7],#1
adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1
ldr lr,[sp,#44] @ restore lr
sbc @acc[0],@acc[0],#0 @ broadcast borrow bit
add sp,sp,#48
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ broadcasting borrow bit to a register, @acc[0], and using it as
@ a whole or extracting single bit.
adds @acc[1],@acc[1],@acc[0] @ add modulus or zero
adcs @acc[2],@acc[2],@acc[0]
str @acc[1],[$r_ptr,#0]
adcs @acc[3],@acc[3],@acc[0]
str @acc[2],[$r_ptr,#4]
adcs @acc[4],@acc[4],#0
str @acc[3],[$r_ptr,#8]
adcs @acc[5],@acc[5],#0
str @acc[4],[$r_ptr,#12]
adcs @acc[6],@acc[6],#0
str @acc[5],[$r_ptr,#16]
adcs @acc[7],@acc[7],@acc[0],lsr#31
str @acc[6],[$r_ptr,#20]
adc @acc[8],@acc[8],@acc[0]
str @acc[7],[$r_ptr,#24]
str @acc[8],[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}
{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that below functions
# use custom calling sequence accepting 256-bit input in registers,
# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
$t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;
$code.=<<___;
.type __ecp_nistz256_sub_from,%function
.align 5
__ecp_nistz256_sub_from:
str lr,[sp,#-4]! @ push lr
ldr $t0,[$b_ptr,#0]
ldr $t1,[$b_ptr,#4]
ldr $t2,[$b_ptr,#8]
ldr $t3,[$b_ptr,#12]
subs $a0,$a0,$t0
ldr $t0,[$b_ptr,#16]
sbcs $a1,$a1,$t1
ldr $t1,[$b_ptr,#20]
sbcs $a2,$a2,$t2
ldr $t2,[$b_ptr,#24]
sbcs $a3,$a3,$t3
ldr $t3,[$b_ptr,#28]
sbcs $a4,$a4,$t0
sbcs $a5,$a5,$t1
sbcs $a6,$a6,$t2
sbcs $a7,$a7,$t3
sbc $ff,$ff,$ff @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
.type __ecp_nistz256_sub_morf,%function
.align 5
__ecp_nistz256_sub_morf:
str lr,[sp,#-4]! @ push lr
ldr $t0,[$b_ptr,#0]
ldr $t1,[$b_ptr,#4]
ldr $t2,[$b_ptr,#8]
ldr $t3,[$b_ptr,#12]
subs $a0,$t0,$a0
ldr $t0,[$b_ptr,#16]
sbcs $a1,$t1,$a1
ldr $t1,[$b_ptr,#20]
sbcs $a2,$t2,$a2
ldr $t2,[$b_ptr,#24]
sbcs $a3,$t3,$a3
ldr $t3,[$b_ptr,#28]
sbcs $a4,$t0,$a4
sbcs $a5,$t1,$a5
sbcs $a6,$t2,$a6
sbcs $a7,$t3,$a7
sbc $ff,$ff,$ff @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
.type __ecp_nistz256_add_self,%function
.align 4
__ecp_nistz256_add_self:
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
adcs $a1,$a1,$a1
adcs $a2,$a2,$a2
adcs $a3,$a3,$a3
adcs $a4,$a4,$a4
adcs $a5,$a5,$a5
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
adc $ff,$ff,#0
@ if a+b >= modulus, subtract modulus.
@
@ But since comparison implies subtraction, we subtract
@ modulus and then add it back if subtraction borrowed.
subs $a0,$a0,#-1
sbcs $a1,$a1,#-1
sbcs $a2,$a2,#-1
sbcs $a3,$a3,#0
sbcs $a4,$a4,#0
sbcs $a5,$a5,#0
sbcs $a6,$a6,#1
sbcs $a7,$a7,#-1
sbc $ff,$ff,#0
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ using value of borrow as a whole or extracting single bit.
@ Follow $ff register...
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
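# In Jacobian coordinates the calls below evaluate the usual a = -3
# doubling formulas:
#	S = 4*X*Y^2, M = 3*(X + Z^2)*(X - Z^2),
#	res_x = M^2 - 2*S, res_y = M*(S - res_x) - 8*Y^4, res_z = 2*Y*Z
# with each p256_* helper being one of the subroutines above.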
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
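# concretely: $S=sp+0, $M=sp+32, $Zsqr=sp+64, $in_x=sp+96, $tmp0=sp+128,
# and the saved r0-r3 (result/input pointers) start at sp+32*5, which is
# where the ldr ...,[sp,#32*5] / [sp,#32*5+4] loads below find them.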
$code.=<<___;
.globl GFp_nistz256_point_double
.type GFp_nistz256_point_double,%function
.align 5
GFp_nistz256_point_double:
stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
sub sp,sp,#32*5
.Lpoint_double_shortcut:
add r3,sp,#$in_x
ldmia $a_ptr!,{r4-r11} @ copy in_x
stmia r3,{r4-r11}
add $r_ptr,sp,#$S
bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
add $b_ptr,$a_ptr,#32
add $a_ptr,$a_ptr,#32
add $r_ptr,sp,#$Zsqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
add $a_ptr,sp,#$S
add $b_ptr,sp,#$S
add $r_ptr,sp,#$S
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
ldr $b_ptr,[sp,#32*5+4]
add $a_ptr,$b_ptr,#32
add $b_ptr,$b_ptr,#64
add $r_ptr,sp,#$tmp0
bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
ldr $r_ptr,[sp,#32*5]
add $r_ptr,$r_ptr,#64
bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$Zsqr
add $r_ptr,sp,#$M
bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$Zsqr
add $r_ptr,sp,#$Zsqr
bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
add $a_ptr,sp,#$S
add $b_ptr,sp,#$S
add $r_ptr,sp,#$tmp0
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
add $a_ptr,sp,#$Zsqr
add $b_ptr,sp,#$M
add $r_ptr,sp,#$M
bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
ldr $r_ptr,[sp,#32*5]
add $a_ptr,sp,#$tmp0
add $r_ptr,$r_ptr,#32
bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
add $a_ptr,sp,#$M
add $r_ptr,sp,#$M
bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$S
add $r_ptr,sp,#$S
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
add $r_ptr,sp,#$tmp0
bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
ldr $r_ptr,[sp,#32*5]
add $a_ptr,sp,#$M
add $b_ptr,sp,#$M
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
add $b_ptr,sp,#$tmp0
bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
add $b_ptr,sp,#$S
add $r_ptr,sp,#$S
bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x);
add $a_ptr,sp,#$M
add $b_ptr,sp,#$S
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
ldr $r_ptr,[sp,#32*5]
add $b_ptr,$r_ptr,#32
add $r_ptr,$r_ptr,#32
bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y);
add sp,sp,#32*5+16 @ +16 means "skip over even the saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size GFp_nistz256_point_double,.-GFp_nistz256_point_double
___
}
}}}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT";

View File

@@ -0,0 +1,908 @@
#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# 3. All advertising materials mentioning features or use of this
# software must display the following acknowledgment:
# "This product includes software developed by the OpenSSL Project
# for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
#
# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
# endorse or promote products derived from this software without
# prior written permission. For written permission, please contact
# openssl-core@openssl.org.
#
# 5. Products derived from this software may not be called "OpenSSL"
# nor may "OpenSSL" appear in their names without prior written
# permission of the OpenSSL Project.
#
# 6. Redistributions of any form whatsoever must retain the following
# acknowledgment:
# "This product includes software developed by the OpenSSL Project
# for use in the OpenSSL Toolkit (http://www.openssl.org/)"
#
# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# ====================================================================
#
# This product includes cryptographic software written by Eric Young
# (eay@cryptsoft.com). This product includes software written by Tim
# Hudson (tjh@cryptsoft.com).
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
# with/without -DECP_NISTZ256_ASM
# Apple A7 +120-360%
# Cortex-A53 +120-400%
# Cortex-A57 +120-350%
# X-Gene +200-330%
# Denver +140-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
$acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
map("x$_",(0..17,19,20));
my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont
$code.=<<___;
#include <GFp/arm_arch.h>
.text
.align 5
.Lpoly:
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.Lone_mont:
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad 1,0,0,0
.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
// void GFp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
// const BN_ULONG x2[4]);
.globl GFp_nistz256_mul_mont
.type GFp_nistz256_mul_mont,%function
.align 4
GFp_nistz256_mul_mont:
stp x29,x30,[sp,#-32]!
add x29,sp,#0
stp x19,x20,[sp,#16]
ldr $bi,[$bp] // bp[0]
ldp $a0,$a1,[$ap]
ldp $a2,$a3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
bl __ecp_nistz256_mul_mont
ldp x19,x20,[sp,#16]
ldp x29,x30,[sp],#32
ret
.size GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont
// void GFp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl GFp_nistz256_sqr_mont
.type GFp_nistz256_sqr_mont,%function
.align 4
GFp_nistz256_sqr_mont:
stp x29,x30,[sp,#-32]!
add x29,sp,#0
stp x19,x20,[sp,#16]
ldp $a0,$a1,[$ap]
ldp $a2,$a3,[$ap,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
bl __ecp_nistz256_sqr_mont
ldp x19,x20,[sp,#16]
ldp x29,x30,[sp],#32
ret
.size GFp_nistz256_sqr_mont,.-GFp_nistz256_sqr_mont
// void GFp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
// const BN_ULONG x2[4]);
.globl GFp_nistz256_add
.type GFp_nistz256_add,%function
.align 4
GFp_nistz256_add:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldp $acc0,$acc1,[$ap]
ldp $t0,$t1,[$bp]
ldp $acc2,$acc3,[$ap,#16]
ldp $t2,$t3,[$bp,#16]
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
bl __ecp_nistz256_add
ldp x29,x30,[sp],#16
ret
.size GFp_nistz256_add,.-GFp_nistz256_add
// void GFp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl GFp_nistz256_neg
.type GFp_nistz256_neg,%function
.align 4
GFp_nistz256_neg:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov $bp,$ap
mov $acc0,xzr // a = 0
mov $acc1,xzr
mov $acc2,xzr
mov $acc3,xzr
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
bl __ecp_nistz256_sub_from
ldp x29,x30,[sp],#16
ret
.size GFp_nistz256_neg,.-GFp_nistz256_neg
// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] to $bi
.type __ecp_nistz256_mul_mont,%function
.align 4
__ecp_nistz256_mul_mont:
mul $acc0,$a0,$bi // a[0]*b[0]
umulh $t0,$a0,$bi
mul $acc1,$a1,$bi // a[1]*b[0]
umulh $t1,$a1,$bi
mul $acc2,$a2,$bi // a[2]*b[0]
umulh $t2,$a2,$bi
mul $acc3,$a3,$bi // a[3]*b[0]
umulh $t3,$a3,$bi
ldr $bi,[$bp,#8] // b[1]
adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
lsl $t0,$acc0,#32
adcs $acc2,$acc2,$t1
lsr $t1,$acc0,#32
adcs $acc3,$acc3,$t2
adc $acc4,xzr,$t3
mov $acc5,xzr
___
for($i=1;$i<4;$i++) {
# A reduction iteration is normally performed by accumulating the
# result of multiplying the modulus by the "magic" digit [and
# omitting the least significant word, which is guaranteed to
# be 0], but thanks to the special form of the modulus and the
# "magic" digit being equal to the least significant word, it can
# be performed with additions and subtractions alone. Indeed:
#
# ffff0001.00000000.0000ffff.ffffffff
# * abcdefgh
# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
#
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
# rewrite above as:
#
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
#
# or marking redundant operations:
#
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
# - 0000abcd.efgh0000.--------.--------.--------
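#
# As a self-contained sketch (illustrative only; it is not emitted into the
# generated assembly and the variable names are hypothetical), the identity
# exploited above can be checked with Math::BigInt:
#
#   use Math::BigInt;
#   my $p = (Math::BigInt->new("0xffffffff00000001") << 192)   # .Lpoly, built
#         + (Math::BigInt->new("0x00000000ffffffff") << 64)    # from its .quad
#         +  Math::BigInt->new("0xffffffffffffffff");          # values
#   my $a = Math::BigInt->new("0x0123456789abcdef");           # acc[0]
#   my $lhs = $a * $p;                                          # acc[0]*modulus
#   my $rhs = ($a << 96)                                        # "+=acc[0]<<96"
#           + (($a * Math::BigInt->new("0xffffffff00000001")) << 192)
#           -  $a;                                              # low word cancels
#   die "mismatch" unless $lhs == $rhs;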
$code.=<<___;
subs $t2,$acc0,$t0 // "*0xffff0001"
sbc $t3,$acc0,$t1
adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
mul $t0,$a0,$bi // lo(a[0]*b[i])
adcs $acc1,$acc2,$t1
mul $t1,$a1,$bi // lo(a[1]*b[i])
adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
mul $t2,$a2,$bi // lo(a[2]*b[i])
adcs $acc3,$acc4,$t3
mul $t3,$a3,$bi // lo(a[3]*b[i])
adc $acc4,$acc5,xzr
adds $acc0,$acc0,$t0 // accumulate low parts of multiplication
umulh $t0,$a0,$bi // hi(a[0]*b[i])
adcs $acc1,$acc1,$t1
umulh $t1,$a1,$bi // hi(a[1]*b[i])
adcs $acc2,$acc2,$t2
umulh $t2,$a2,$bi // hi(a[2]*b[i])
adcs $acc3,$acc3,$t3
umulh $t3,$a3,$bi // hi(a[3]*b[i])
adc $acc4,$acc4,xzr
___
$code.=<<___ if ($i<3);
ldr $bi,[$bp,#8*($i+1)] // b[$i+1]
___
$code.=<<___;
adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
lsl $t0,$acc0,#32
adcs $acc2,$acc2,$t1
lsr $t1,$acc0,#32
adcs $acc3,$acc3,$t2
adcs $acc4,$acc4,$t3
adc $acc5,xzr,xzr
___
}
$code.=<<___;
// last reduction
subs $t2,$acc0,$t0 // "*0xffff0001"
sbc $t3,$acc0,$t1
adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
adcs $acc1,$acc2,$t1
adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
adcs $acc3,$acc4,$t3
adc $acc4,$acc5,xzr
adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
sbcs $t1,$acc1,$poly1
sbcs $t2,$acc2,xzr
sbcs $t3,$acc3,$poly3
sbcs xzr,$acc4,xzr // did it borrow?
csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
csel $acc1,$acc1,$t1,lo
csel $acc2,$acc2,$t2,lo
stp $acc0,$acc1,[$rp]
csel $acc3,$acc3,$t3,lo
stp $acc2,$acc3,[$rp,#16]
ret
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type __ecp_nistz256_sqr_mont,%function
.align 4
__ecp_nistz256_sqr_mont:
// | | | | | |a1*a0| |
// | | | | |a2*a0| | |
// | |a3*a2|a3*a0| | | |
// | | | |a2*a1| | | |
// | | |a3*a1| | | | |
// *| | | | | | | | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
// |--+--+--+--+--+--+--+--|
// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
//
// "can't overflow" below mark carrying into high part of
// multiplication result, which can't overflow, because it
// can never be all ones.
mul $acc1,$a1,$a0 // a[1]*a[0]
umulh $t1,$a1,$a0
mul $acc2,$a2,$a0 // a[2]*a[0]
umulh $t2,$a2,$a0
mul $acc3,$a3,$a0 // a[3]*a[0]
umulh $acc4,$a3,$a0
adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
mul $t0,$a2,$a1 // a[2]*a[1]
umulh $t1,$a2,$a1
adcs $acc3,$acc3,$t2
mul $t2,$a3,$a1 // a[3]*a[1]
umulh $t3,$a3,$a1
adc $acc4,$acc4,xzr // can't overflow
mul $acc5,$a3,$a2 // a[3]*a[2]
umulh $acc6,$a3,$a2
adds $t1,$t1,$t2 // accumulate high parts of multiplication
mul $acc0,$a0,$a0 // a[0]*a[0]
adc $t2,$t3,xzr // can't overflow
adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
umulh $a0,$a0,$a0
adcs $acc4,$acc4,$t1
mul $t1,$a1,$a1 // a[1]*a[1]
adcs $acc5,$acc5,$t2
umulh $a1,$a1,$a1
adc $acc6,$acc6,xzr // can't overflow
adds $acc1,$acc1,$acc1 // acc[1-6]*=2
mul $t2,$a2,$a2 // a[2]*a[2]
adcs $acc2,$acc2,$acc2
umulh $a2,$a2,$a2
adcs $acc3,$acc3,$acc3
mul $t3,$a3,$a3 // a[3]*a[3]
adcs $acc4,$acc4,$acc4
umulh $a3,$a3,$a3
adcs $acc5,$acc5,$acc5
adcs $acc6,$acc6,$acc6
adc $acc7,xzr,xzr
adds $acc1,$acc1,$a0 // +a[i]*a[i]
adcs $acc2,$acc2,$t1
adcs $acc3,$acc3,$a1
adcs $acc4,$acc4,$t2
adcs $acc5,$acc5,$a2
lsl $t0,$acc0,#32
adcs $acc6,$acc6,$t3
lsr $t1,$acc0,#32
adc $acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) { # reductions, see commentary in
# multiplication for details
$code.=<<___;
subs $t2,$acc0,$t0 // "*0xffff0001"
sbc $t3,$acc0,$t1
adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
adcs $acc1,$acc2,$t1
lsl $t0,$acc0,#32
adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
lsr $t1,$acc0,#32
adc $acc3,$t3,xzr // can't overflow
___
}
$code.=<<___;
subs $t2,$acc0,$t0 // "*0xffff0001"
sbc $t3,$acc0,$t1
adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
adcs $acc1,$acc2,$t1
adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
adc $acc3,$t3,xzr // can't overflow
adds $acc0,$acc0,$acc4 // accumulate upper half
adcs $acc1,$acc1,$acc5
adcs $acc2,$acc2,$acc6
adcs $acc3,$acc3,$acc7
adc $acc4,xzr,xzr
adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
sbcs $t1,$acc1,$poly1
sbcs $t2,$acc2,xzr
sbcs $t3,$acc3,$poly3
sbcs xzr,$acc4,xzr // did it borrow?
csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
csel $acc1,$acc1,$t1,lo
csel $acc2,$acc2,$t2,lo
stp $acc0,$acc1,[$rp]
csel $acc3,$acc3,$t3,lo
stp $acc2,$acc3,[$rp,#16]
ret
.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type __ecp_nistz256_add,%function
.align 4
__ecp_nistz256_add:
adds $acc0,$acc0,$t0 // ret = a+b
adcs $acc1,$acc1,$t1
adcs $acc2,$acc2,$t2
adcs $acc3,$acc3,$t3
adc $ap,xzr,xzr // zap $ap
adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
sbcs $t1,$acc1,$poly1
sbcs $t2,$acc2,xzr
sbcs $t3,$acc3,$poly3
sbcs xzr,$ap,xzr // did subtraction borrow?
csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
csel $acc1,$acc1,$t1,lo
csel $acc2,$acc2,$t2,lo
stp $acc0,$acc1,[$rp]
csel $acc3,$acc3,$t3,lo
stp $acc2,$acc3,[$rp,#16]
ret
.size __ecp_nistz256_add,.-__ecp_nistz256_add
.type __ecp_nistz256_sub_from,%function
.align 4
__ecp_nistz256_sub_from:
ldp $t0,$t1,[$bp]
ldp $t2,$t3,[$bp,#16]
subs $acc0,$acc0,$t0 // ret = a-b
sbcs $acc1,$acc1,$t1
sbcs $acc2,$acc2,$t2
sbcs $acc3,$acc3,$t3
sbc $ap,xzr,xzr // zap $ap
subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
adcs $t1,$acc1,$poly1
adcs $t2,$acc2,xzr
adc $t3,$acc3,$poly3
cmp $ap,xzr // did subtraction borrow?
csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
csel $acc1,$acc1,$t1,eq
csel $acc2,$acc2,$t2,eq
stp $acc0,$acc1,[$rp]
csel $acc3,$acc3,$t3,eq
stp $acc2,$acc3,[$rp,#16]
ret
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
.type __ecp_nistz256_sub_morf,%function
.align 4
__ecp_nistz256_sub_morf:
ldp $t0,$t1,[$bp]
ldp $t2,$t3,[$bp,#16]
subs $acc0,$t0,$acc0 // ret = b-a
sbcs $acc1,$t1,$acc1
sbcs $acc2,$t2,$acc2
sbcs $acc3,$t3,$acc3
sbc $ap,xzr,xzr // zap $ap
subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
adcs $t1,$acc1,$poly1
adcs $t2,$acc2,xzr
adc $t3,$acc3,$poly3
cmp $ap,xzr // did subtraction borrow?
csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
csel $acc1,$acc1,$t1,eq
csel $acc2,$acc2,$t2,eq
stp $acc0,$acc1,[$rp]
csel $acc3,$acc3,$t3,eq
stp $acc2,$acc3,[$rp,#16]
ret
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
.type __ecp_nistz256_div_by_2,%function
.align 4
__ecp_nistz256_div_by_2:
subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus
adcs $t1,$acc1,$poly1
adcs $t2,$acc2,xzr
adcs $t3,$acc3,$poly3
adc $ap,xzr,xzr // zap $ap
tst $acc0,#1 // is a even?
csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
csel $acc1,$acc1,$t1,eq
csel $acc2,$acc2,$t2,eq
csel $acc3,$acc3,$t3,eq
csel $ap,xzr,$ap,eq
lsr $acc0,$acc0,#1 // ret >>= 1
orr $acc0,$acc0,$acc1,lsl#63
lsr $acc1,$acc1,#1
orr $acc1,$acc1,$acc2,lsl#63
lsr $acc2,$acc2,#1
orr $acc2,$acc2,$acc3,lsl#63
lsr $acc3,$acc3,#1
stp $acc0,$acc1,[$rp]
orr $acc3,$acc3,$ap,lsl#63
stp $acc2,$acc3,[$rp,#16]
ret
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those found
# in ecp_nistz256.c.
#
########################################################################
# void GFp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));
$code.=<<___;
.globl GFp_nistz256_point_double
.type GFp_nistz256_point_double,%function
.align 5
GFp_nistz256_point_double:
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
sub sp,sp,#32*4
.Ldouble_shortcut:
ldp $acc0,$acc1,[$ap,#32]
mov $rp_real,$rp
ldp $acc2,$acc3,[$ap,#48]
mov $ap_real,$ap
ldr $poly1,.Lpoly+8
mov $t0,$acc0
ldr $poly3,.Lpoly+24
mov $t1,$acc1
ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont
mov $t2,$acc2
mov $t3,$acc3
ldp $a2,$a3,[$ap_real,#64+16]
add $rp,sp,#$S
bl __ecp_nistz256_add // p256_mul_by_2(S, in_y);
add $rp,sp,#$Zsqr
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
ldp $t0,$t1,[$ap_real]
ldp $t2,$t3,[$ap_real,#16]
mov $a0,$acc0 // put Zsqr aside for p256_sub
mov $a1,$acc1
mov $a2,$acc2
mov $a3,$acc3
add $rp,sp,#$M
bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x);
add $bp,$ap_real,#0
mov $acc0,$a0 // restore Zsqr
mov $acc1,$a1
ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
mov $acc2,$a2
mov $acc3,$a3
ldp $a2,$a3,[sp,#$S+16]
add $rp,sp,#$Zsqr
bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
add $rp,sp,#$S
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
ldr $bi,[$ap_real,#32]
ldp $a0,$a1,[$ap_real,#64]
ldp $a2,$a3,[$ap_real,#64+16]
add $bp,$ap_real,#32
add $rp,sp,#$tmp0
bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
mov $t0,$acc0
mov $t1,$acc1
ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
mov $t2,$acc2
mov $t3,$acc3
ldp $a2,$a3,[sp,#$S+16]
add $rp,$rp_real,#64
bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0);
add $rp,sp,#$tmp0
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont
ldp $a0,$a1,[sp,#$M]
ldp $a2,$a3,[sp,#$M+16]
add $rp,$rp_real,#32
bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
add $bp,sp,#$Zsqr
add $rp,sp,#$M
bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
mov $t0,$acc0 // duplicate M
mov $t1,$acc1
mov $t2,$acc2
mov $t3,$acc3
mov $a0,$acc0 // put M aside
mov $a1,$acc1
mov $a2,$acc2
mov $a3,$acc3
add $rp,sp,#$M
bl __ecp_nistz256_add
mov $t0,$a0 // restore M
mov $t1,$a1
ldr $bi,[$ap_real] // forward load for p256_mul_mont
mov $t2,$a2
ldp $a0,$a1,[sp,#$S]
mov $t3,$a3
ldp $a2,$a3,[sp,#$S+16]
bl __ecp_nistz256_add // p256_mul_by_3(M, M);
add $bp,$ap_real,#0
add $rp,sp,#$S
bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
mov $t0,$acc0
mov $t1,$acc1
ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont
mov $t2,$acc2
mov $t3,$acc3
ldp $a2,$a3,[sp,#$M+16]
add $rp,sp,#$tmp0
bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S);
add $rp,$rp_real,#0
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
add $bp,sp,#$tmp0
bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
add $bp,sp,#$S
add $rp,sp,#$S
bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
ldr $bi,[sp,#$M]
mov $a0,$acc0 // copy S
mov $a1,$acc1
mov $a2,$acc2
mov $a3,$acc3
add $bp,sp,#$M
bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
add $bp,$rp_real,#32
add $rp,$rp_real,#32
bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
add sp,x29,#0 // destroy frame
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x29,x30,[sp],#80
ret
.size GFp_nistz256_point_double,.-GFp_nistz256_point_double
___
}
########################################################################
# void GFp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
# const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
$U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
$code.=<<___;
.globl GFp_nistz256_point_add_affine
.type GFp_nistz256_point_add_affine,%function
.align 5
GFp_nistz256_point_add_affine:
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
sub sp,sp,#32*10
mov $rp_real,$rp
mov $ap_real,$ap
mov $bp_real,$bp
ldr $poly1,.Lpoly+8
ldr $poly3,.Lpoly+24
ldp $a0,$a1,[$ap,#64] // in1_z
ldp $a2,$a3,[$ap,#64+16]
orr $t0,$a0,$a1
orr $t2,$a2,$a3
orr $in1infty,$t0,$t2
cmp $in1infty,#0
csetm $in1infty,ne // !in1infty
ldp $acc0,$acc1,[$bp] // in2_x
ldp $acc2,$acc3,[$bp,#16]
ldp $t0,$t1,[$bp,#32] // in2_y
ldp $t2,$t3,[$bp,#48]
orr $acc0,$acc0,$acc1
orr $acc2,$acc2,$acc3
orr $t0,$t0,$t1
orr $t2,$t2,$t3
orr $acc0,$acc0,$acc2
orr $t0,$t0,$t2
orr $in2infty,$acc0,$t0
cmp $in2infty,#0
csetm $in2infty,ne // !in2infty
add $rp,sp,#$Z1sqr
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
mov $a0,$acc0
mov $a1,$acc1
mov $a2,$acc2
mov $a3,$acc3
ldr $bi,[$bp_real]
add $bp,$bp_real,#0
add $rp,sp,#$U2
bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
add $bp,$ap_real,#0
ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont
ldp $a0,$a1,[sp,#$Z1sqr]
ldp $a2,$a3,[sp,#$Z1sqr+16]
add $rp,sp,#$H
bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
add $bp,$ap_real,#64
add $rp,sp,#$S2
bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
ldr $bi,[$ap_real,#64]
ldp $a0,$a1,[sp,#$H]
ldp $a2,$a3,[sp,#$H+16]
add $bp,$ap_real,#64
add $rp,sp,#$res_z
bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
ldr $bi,[$bp_real,#32]
ldp $a0,$a1,[sp,#$S2]
ldp $a2,$a3,[sp,#$S2+16]
add $bp,$bp_real,#32
add $rp,sp,#$S2
bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
add $bp,$ap_real,#32
ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont
ldp $a2,$a3,[sp,#$H+16]
add $rp,sp,#$R
bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
add $rp,sp,#$Hsqr
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
ldp $a0,$a1,[sp,#$R]
ldp $a2,$a3,[sp,#$R+16]
add $rp,sp,#$Rsqr
bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
ldr $bi,[sp,#$H]
ldp $a0,$a1,[sp,#$Hsqr]
ldp $a2,$a3,[sp,#$Hsqr+16]
add $bp,sp,#$H
add $rp,sp,#$Hcub
bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
ldr $bi,[$ap_real]
ldp $a0,$a1,[sp,#$Hsqr]
ldp $a2,$a3,[sp,#$Hsqr+16]
add $bp,$ap_real,#0
add $rp,sp,#$U2
bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
mov $t0,$acc0
mov $t1,$acc1
mov $t2,$acc2
mov $t3,$acc3
add $rp,sp,#$Hsqr
bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);
add $bp,sp,#$Rsqr
add $rp,sp,#$res_x
bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
add $bp,sp,#$Hcub
bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
add $bp,sp,#$U2
ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont
ldp $a0,$a1,[sp,#$Hcub]
ldp $a2,$a3,[sp,#$Hcub+16]
add $rp,sp,#$res_y
bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
add $bp,$ap_real,#32
add $rp,sp,#$S2
bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
ldr $bi,[sp,#$R]
ldp $a0,$a1,[sp,#$res_y]
ldp $a2,$a3,[sp,#$res_y+16]
add $bp,sp,#$R
add $rp,sp,#$res_y
bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
add $bp,sp,#$S2
bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
ldp $a0,$a1,[sp,#$res_x] // res
ldp $a2,$a3,[sp,#$res_x+16]
ldp $t0,$t1,[$bp_real] // in2
ldp $t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) { # conditional moves
$code.=<<___;
ldp $acc0,$acc1,[$ap_real,#$i] // in1
cmp $in1infty,#0 // !$in1infty, remember?
ldp $acc2,$acc3,[$ap_real,#$i+16]
csel $t0,$a0,$t0,ne
csel $t1,$a1,$t1,ne
ldp $a0,$a1,[sp,#$res_x+$i+32] // res
csel $t2,$a2,$t2,ne
csel $t3,$a3,$t3,ne
cmp $in2infty,#0 // !$in2infty, remember?
ldp $a2,$a3,[sp,#$res_x+$i+48]
csel $acc0,$t0,$acc0,ne
csel $acc1,$t1,$acc1,ne
ldp $t0,$t1,[$bp_real,#$i+32] // in2
csel $acc2,$t2,$acc2,ne
csel $acc3,$t3,$acc3,ne
ldp $t2,$t3,[$bp_real,#$i+48]
stp $acc0,$acc1,[$rp_real,#$i]
stp $acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___ if ($i == 0);
adr $bp_real,.Lone_mont-64
___
}
$code.=<<___;
ldp $acc0,$acc1,[$ap_real,#$i] // in1
cmp $in1infty,#0 // !$in1infty, remember?
ldp $acc2,$acc3,[$ap_real,#$i+16]
csel $t0,$a0,$t0,ne
csel $t1,$a1,$t1,ne
csel $t2,$a2,$t2,ne
csel $t3,$a3,$t3,ne
cmp $in2infty,#0 // !$in2infty, remember?
csel $acc0,$t0,$acc0,ne
csel $acc1,$t1,$acc1,ne
csel $acc2,$t2,$acc2,ne
csel $acc3,$t3,$acc3,ne
stp $acc0,$acc1,[$rp_real,#$i]
stp $acc2,$acc3,[$rp_real,#$i+16]
add sp,x29,#0 // destroy frame
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x29,x30,[sp],#80
ret
.size GFp_nistz256_point_add_affine,.-GFp_nistz256_point_add_affine
___
} }
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,52 @@
/* Copyright (c) 2014, Intel Corporation.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "ecp_nistz.h"
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wconversion"
#endif
/* Fills |str| with the bytewise little-endian encoding of |scalar|, where
* |scalar| has |num_limbs| limbs. |str| is padded with zeros at the end up
* to |str_len| bytes. Actually, |str_len| must be exactly one byte more than is
* needed to encode the |num_limbs| limbs losslessly, so that there is an extra byte at
* the end. The extra byte is useful because the caller will be breaking |str|
* up into windows of a number of bits (5 or 7) that isn't divisible by 8, and
* so it is useful for it to be able to read an extra zero byte. */
void gfp_little_endian_bytes_from_scalar(uint8_t str[], size_t str_len,
const Limb scalar[],
size_t num_limbs) {
debug_assert_nonsecret(str_len == (num_limbs * sizeof(Limb)) + 1);
size_t i;
for (i = 0; i < num_limbs * sizeof(Limb); i += sizeof(Limb)) {
Limb d = scalar[i / sizeof(Limb)];
str[i + 0] = d & 0xff;
str[i + 1] = (d >> 8) & 0xff;
str[i + 2] = (d >> 16) & 0xff;
str[i + 3] = (d >>= 24) & 0xff;
if (sizeof(Limb) == 8) {
d >>= 8;
str[i + 4] = d & 0xff;
str[i + 5] = (d >> 8) & 0xff;
str[i + 6] = (d >> 16) & 0xff;
str[i + 7] = (d >> 24) & 0xff;
}
}
for (; i < str_len; i++) {
str[i] = 0;
}
}
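/* Usage sketch (illustrative only; |scalar| here is a hypothetical P-256
* scalar on a 64-bit target, so |num_limbs| == 4, |str_len| must be
* 4*8 + 1 == 33, and the final byte is always written as zero):
*
*   uint8_t str[33];
*   gfp_little_endian_bytes_from_scalar(str, sizeof(str), scalar, 4);
*/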

View File

@@ -0,0 +1,274 @@
/* Copyright (c) 2015, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#ifndef OPENSSL_HEADER_EC_ECP_NISTZ_H
#define OPENSSL_HEADER_EC_ECP_NISTZ_H
#include <GFp/base.h>
#include "../../limbs/limbs.h"
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
// This function looks at `w + 1` scalar bits (`w` current, 1 adjacent less
// significant bit), and recodes them into a signed digit for use in fast point
// multiplication: the use of signed rather than unsigned digits means that
// fewer points need to be precomputed, given that point inversion is easy (a
// precomputed point dP makes -dP available as well).
//
// BACKGROUND:
//
// Signed digits for multiplication were introduced by Booth ("A signed binary
// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV,
// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers.
// Booth's original encoding did not generally improve the density of nonzero
// digits over the binary representation, and was merely meant to simplify the
// handling of signed factors given in two's complement; but it has since been
// shown to be the basis of various signed-digit representations that do have
// further advantages, including the wNAF, using the following general
// approach:
//
// (1) Given a binary representation
//
// b_k ... b_2 b_1 b_0,
//
// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1
// by using bit-wise subtraction as follows:
//
// b_k b_(k-1) ... b_2 b_1 b_0
// - b_k ... b_3 b_2 b_1 b_0
// -----------------------------------------
// s_(k+1) s_k ... s_3 s_2 s_1 s_0
//
// A left-shift followed by subtraction of the original value yields a new
// representation of the same value, using signed bits s_i = b_(i-1) - b_i.
// This representation from Booth's paper has since appeared in the
// literature under a variety of different names including "reversed binary
// form", "alternating greedy expansion", "mutual opposite form", and
// "sign-alternating {+-1}-representation".
//
// An interesting property is that among the nonzero bits, values 1 and -1
// strictly alternate.
//
// (2) Various window schemes can be applied to the Booth representation of
// integers: for example, right-to-left sliding windows yield the wNAF
// (a signed-digit encoding independently discovered by various researchers
// in the 1990s), and left-to-right sliding windows yield a left-to-right
// equivalent of the wNAF (independently discovered by various researchers
// around 2004).
//
// To prevent leaking information through side channels in point multiplication,
// we need to recode the given integer into a regular pattern: sliding windows
// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few
// decades older: we'll be using the so-called "modified Booth encoding" due to
// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49
// (1961), pp. 67-91), in a radix-2**w setting. That is, we always combine `w`
// signed bits into a signed digit, e.g. (for `w == 5`):
//
// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j)
//
// The sign-alternating property implies that the resulting digit values are
// integers from `-2**(w-1)` to `2**(w-1)`, e.g. -16 to 16 for `w == 5`.
//
// Of course, we don't actually need to compute the signed digits s_i as an
// intermediate step (that's just a nice way to see how this scheme relates
// to the wNAF): a direct computation obtains the recoded digit from the
// six bits b_(5j + 4) ... b_(5j - 1).
//
// This function takes those `w` bits as an integer (e.g. 0 .. 63), writing the
// recoded digit to *is_negative (0 for positive, 1 for negative) and *digit
// (absolute value, in the range 0 .. 2**(w-1)). Note that this integer
// essentially provides
// the input bits "shifted to the left" by one position: for example, the input
// to compute the least significant recoded digit, given that there's no bit
// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0.
//
// DOUBLING CASE:
//
// Point addition formulas for short Weierstrass curves are often incomplete.
// Edge cases such as P + P or P + ∞ must be handled separately. This
// complicates constant-time requirements. P + ∞ cannot be avoided (any window
// may be zero) and is handled with constant-time selects. P + P (where P is not
// ∞) usually is not. Instead, windowing strategies are chosen to avoid this
// case. Whether this happens depends on the group order.
//
// Let w be the window width (in this function, w = 5). The non-trivial doubling
// case in single-point scalar multiplication may occur if and only if the
// 2^(w-1) bit of the group order is zero.
//
// Note the above only holds if the scalar is fully reduced and the group order
// is a prime that is much larger than 2^w. It also only holds when windows
// are applied from most significant to least significant, doubling between each
// window. It does not apply to more complex table strategies such as
// |EC_GFp_nistz256_method|.
//
// PROOF:
//
// Let n be the group order. Let l be the number of bits needed to represent n.
// Assume there exists some 0 <= k < n such that signed w-bit windowed
// multiplication hits the doubling case.
//
// Windowed multiplication consists of iterating over groups of s_i (defined
// above based on k's binary representation) from most to least significant. At
// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant
// window), we:
//
// 1. Double the accumulator A, w times. Let A_i be the value of A at this
// point.
//
// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P
// corresponding to the window s_(i+w-1) ... s_i.
//
// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as
// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i.
// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is
// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i =
// 2^w * (a_(i+w) + t_(i+w)).
//
// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it
// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i.
// This is computed as:
//
// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1)
// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i
// --------------------------------------------
// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i
//
// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer
// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i.
//
// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x)
// = x - 2^(w-1)*b_(i+w-1) + b_(i-1)
//
// Or, using C notation for bit operations:
//
// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1
//
// Note b_(i-1) is added in left-shifted by one (or doubled) from its place.
// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed
// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C
// notation, this is:
//
// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w
//
// Observe that, while t_i may be positive or negative, a_i is bounded by
// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up
// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for
// all groups. That would imply the subsequent a_i is zero, which means all
// terms thus far were zero.)
//
// Returning to our doubling position, we have a_j = t_j (mod n). We now
// determine the value of a_j - t_j, which must be divisible by n. Our bounds on
// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w
// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if
// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n.
//
// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then,
//
// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j
// <= k/2^j + 2^w - t_j
// < n/2^w + 2^w + 2^(w-1)
//
// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final
// addition may hit the doubling case.
//
// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L
// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the
// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0.
// That is:
//
// - 2^w divides k_H
// - k_M is 0 or 2^(w-1)
// - 0 <= k_L < 2^(w-1)
//
// Divide n into n_H + n_M + n_L similarly. We thus have:
//
// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1
// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1))
// = k_L - k_M
//
// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w
// = (k>>w) << w + ((k>>(w-1)) & 1) << w
// = k_H + 2*k_M
//
// n = a_0 - t_0
// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M)
// = k_H + 3*k_M - k_L
//
// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be
// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H.
// Then,
//
// n_M + n_L = 3*(2^(w-1)) - k_L
// > 3*(2^(w-1)) - 2^(w-1)
// = 2^w
//
// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose
// k_H < n_H - 2*2^w. Then,
//
// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L
// < n_H - 2*2^w + 3*(2^(w-1)) - k_L
// n_M + n_L < -2^(w-1) - k_L
//
// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus,
//
// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L
// = n_H - 2^w + 3*(2^(w-1)) - k_L
// n_M + n_L = 2^(w-1) - k_L
// <= 2^(w-1)
//
// Equality would mean 2^(w-1) divides n, which is impossible if n is prime.
// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition.
//
// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w,
// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point
// doubling in the final addition and is the only such scalar.
//
// COMMON CURVES:
//
// The group orders for common curves end in the following bit patterns:
//
// P-521: ...00001001; w = 4 is okay
// P-384: ...01110011; w = 2, 5, 6, 7 are okay
// P-256: ...01010001; w = 5, 7 are okay
// P-224: ...00111101; w = 3, 4, 5, 6 are okay
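//
// For example, reading the table above for P-256: the trailing bits 01010001
// have bits 4 and 6 set, so the w = 5 and w = 7 windowings used by this code
// avoid the non-trivial doubling case, while w = 4 (bit 3 clear) would not.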
static inline void booth_recode(crypto_word *is_negative, crypto_word *digit,
crypto_word in, crypto_word w) {
debug_assert_nonsecret(w >= 2);
debug_assert_nonsecret(w <= 7);
// Set all bits of `s` to MSB(in), similar to |constant_time_msb_s|,
// but 'in' seen as (`w+1`)-bit value.
crypto_word s = ~((in >> w) - 1);
crypto_word d;
d = ((crypto_word)1u << (w + 1)) - in - 1;
d = (d & s) | (in & ~s);
d = (d >> 1) + (d & 1);
*is_negative = constant_time_is_nonzero_w(s & 1);
*digit = d;
}
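//
// Worked examples (a sketch, not exercised by this file), with w = 5 so the
// input is the six bits b_(5j+4) ... b_(5j) b_(5j-1):
//
//   in = 0b001101 (13): s = 0, d = 13, (13>>1)+(13&1) = 7,
//     giving *is_negative = 0, *digit = 7 (signed window value +7).
//   in = 0b110001 (49): s = all-ones, d = 64-49-1 = 14, 14>>1 = 7,
//     giving *is_negative = 1, *digit = 7 (signed window value -7).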
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
void gfp_little_endian_bytes_from_scalar(uint8_t str[], size_t str_len,
const Limb scalar[],
size_t num_limbs);
#endif // OPENSSL_HEADER_EC_ECP_NISTZ_H

View File

@@ -0,0 +1,349 @@
/* Copyright (c) 2014, Intel Corporation.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
/* Developers and authors:
* Shay Gueron (1, 2), and Vlad Krasnov (1)
* (1) Intel Corporation, Israel Development Center
* (2) University of Haifa
* Reference:
* Shay Gueron and Vlad Krasnov
* "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
* http://eprint.iacr.org/2013/816 */
#include "ecp_nistz256.h"
#include "ecp_nistz.h"
#include "../bn/internal.h"
#include "../../limbs/limbs.inl"
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
/* Functions implemented in assembly */
/* Modular neg: res = -a mod P */
void GFp_nistz256_neg(Limb res[P256_LIMBS], const Limb a[P256_LIMBS]);
/* One converted into the Montgomery domain */
static const Limb ONE[P256_LIMBS] = {
TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000),
TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe),
};
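/* Constant-time conditional copy: replaces |dst| with |src| when |move| is
* all-ones, and leaves |dst| unchanged when |move| is zero. |move| must be
* either 0 or a mask of all ones. */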
static void copy_conditional(Limb dst[P256_LIMBS],
const Limb src[P256_LIMBS], Limb move) {
Limb mask1 = move;
Limb mask2 = ~mask1;
dst[0] = (src[0] & mask1) ^ (dst[0] & mask2);
dst[1] = (src[1] & mask1) ^ (dst[1] & mask2);
dst[2] = (src[2] & mask1) ^ (dst[2] & mask2);
dst[3] = (src[3] & mask1) ^ (dst[3] & mask2);
if (P256_LIMBS == 8) {
dst[4] = (src[4] & mask1) ^ (dst[4] & mask2);
dst[5] = (src[5] & mask1) ^ (dst[5] & mask2);
dst[6] = (src[6] & mask1) ^ (dst[6] & mask2);
dst[7] = (src[7] & mask1) ^ (dst[7] & mask2);
}
}
void GFp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
#if defined(GFp_USE_LARGE_TABLE)
void GFp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
const P256_POINT_AFFINE *b);
#endif
void GFp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
const P256_POINT *b);
// |GFp_nistz256_point_add| is defined in assembly language in X86-64 only.
#if !defined(OPENSSL_X86_64)
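/* The P-256 field prime, q = 2^256 - 2^224 + 2^192 + 2^96 - 1, as
* little-endian limbs. */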
static const BN_ULONG Q[P256_LIMBS] = {
TOBN(0xffffffff, 0xffffffff),
TOBN(0x00000000, 0xffffffff),
TOBN(0x00000000, 0x00000000),
TOBN(0xffffffff, 0x00000001),
};
static inline Limb is_equal(const Limb a[P256_LIMBS], const Limb b[P256_LIMBS]) {
return LIMBS_equal(a, b, P256_LIMBS);
}
static inline Limb is_zero(const BN_ULONG a[P256_LIMBS]) {
return LIMBS_are_zero(a, P256_LIMBS);
}
static inline void elem_mul_by_2(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) {
LIMBS_shl_mod(r, a, Q, P256_LIMBS);
}
static inline void elem_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
const Limb b[P256_LIMBS]) {
GFp_nistz256_mul_mont(r, a, b);
}
static inline void elem_sqr_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) {
GFp_nistz256_sqr_mont(r, a);
}
static inline void elem_sub(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
const Limb b[P256_LIMBS]) {
LIMBS_sub_mod(r, a, b, Q, P256_LIMBS);
}
/* Point addition: r = a+b */
void GFp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b) {
BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS];
BN_ULONG Z1sqr[P256_LIMBS];
BN_ULONG Z2sqr[P256_LIMBS];
BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
BN_ULONG Hsqr[P256_LIMBS];
BN_ULONG Rsqr[P256_LIMBS];
BN_ULONG Hcub[P256_LIMBS];
BN_ULONG res_x[P256_LIMBS];
BN_ULONG res_y[P256_LIMBS];
BN_ULONG res_z[P256_LIMBS];
const BN_ULONG *in1_x = a->X;
const BN_ULONG *in1_y = a->Y;
const BN_ULONG *in1_z = a->Z;
const BN_ULONG *in2_x = b->X;
const BN_ULONG *in2_y = b->Y;
const BN_ULONG *in2_z = b->Z;
BN_ULONG in1infty = is_zero(a->Z);
BN_ULONG in2infty = is_zero(b->Z);
elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */
elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */
elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */
elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */
elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */
elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */
elem_sub(R, S2, S1); /* R = S2 - S1 */
elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */
elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */
elem_sub(H, U2, U1); /* H = U2 - U1 */
BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty;
if (is_exceptional) {
if (is_equal(S1, S2)) {
GFp_nistz256_point_double(r, a);
} else {
limbs_zero(r->X, P256_LIMBS);
limbs_zero(r->Y, P256_LIMBS);
limbs_zero(r->Z, P256_LIMBS);
}
return;
}
elem_sqr_mont(Rsqr, R); /* R^2 */
elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */
elem_sqr_mont(Hsqr, H); /* H^2 */
elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */
elem_mul_mont(Hcub, Hsqr, H); /* H^3 */
elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */
elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */
elem_sub(res_x, Rsqr, Hsqr);
elem_sub(res_x, res_x, Hcub);
elem_sub(res_y, U2, res_x);
elem_mul_mont(S2, S1, Hcub);
elem_mul_mont(res_y, R, res_y);
elem_sub(res_y, res_y, S2);
copy_conditional(res_x, in2_x, in1infty);
copy_conditional(res_y, in2_y, in1infty);
copy_conditional(res_z, in2_z, in1infty);
copy_conditional(res_x, in1_x, in2infty);
copy_conditional(res_y, in1_y, in2infty);
copy_conditional(res_z, in1_z, in2infty);
limbs_copy(r->X, res_x, P256_LIMBS);
limbs_copy(r->Y, res_y, P256_LIMBS);
limbs_copy(r->Z, res_z, P256_LIMBS);
}
#endif
/* r = p * p_scalar */
void GFp_nistz256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS],
const Limb p_x[P256_LIMBS],
const Limb p_y[P256_LIMBS]) {
static const size_t kWindowSize = 5;
static const crypto_word kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
uint8_t p_str[(P256_LIMBS * sizeof(Limb)) + 1];
gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
p_scalar, P256_LIMBS);
/* A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should
* add no more than 63 bytes of overhead. Thus, |table| should require
* ~1599 ((96 * 16) + 63) bytes of stack space. */
alignas(64) P256_POINT table[16];
/* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
* not stored. All other values are actually stored with an offset of -1 in
* table. */
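/* A Booth-recoded digit d in 1..16 therefore selects row[d - 1] == d*P, while
* d == 0 makes |GFp_nistz256_select_w5| return the all-zero point (the point
* at infinity). */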
P256_POINT *row = table;
limbs_copy(row[1 - 1].X, p_x, P256_LIMBS);
limbs_copy(row[1 - 1].Y, p_y, P256_LIMBS);
limbs_copy(row[1 - 1].Z, ONE, P256_LIMBS);
GFp_nistz256_point_double(&row[2 - 1], &row[1 - 1]);
GFp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
GFp_nistz256_point_double(&row[4 - 1], &row[2 - 1]);
GFp_nistz256_point_double(&row[6 - 1], &row[3 - 1]);
GFp_nistz256_point_double(&row[8 - 1], &row[4 - 1]);
GFp_nistz256_point_double(&row[12 - 1], &row[6 - 1]);
GFp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
GFp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
GFp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
GFp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
GFp_nistz256_point_double(&row[14 - 1], &row[7 - 1]);
GFp_nistz256_point_double(&row[10 - 1], &row[5 - 1]);
GFp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
GFp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
GFp_nistz256_point_double(&row[16 - 1], &row[8 - 1]);
Limb tmp[P256_LIMBS];
alignas(32) P256_POINT h;
static const size_t START_INDEX = 256 - 1;
size_t index = START_INDEX;
crypto_word raw_wvalue;
crypto_word recoded_is_negative;
crypto_word recoded;
raw_wvalue = p_str[(index - 1) / 8];
raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask;
booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize);
dev_assert_secret(!recoded_is_negative);
GFp_nistz256_select_w5(r, table, recoded);
while (index >= kWindowSize) {
if (index != START_INDEX) {
size_t off = (index - 1) / 8;
raw_wvalue = p_str[off] | p_str[off + 1] << 8;
raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask;
booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize);
GFp_nistz256_select_w5(&h, table, recoded);
GFp_nistz256_neg(tmp, h.Y);
copy_conditional(h.Y, tmp, recoded_is_negative);
GFp_nistz256_point_add(r, r, &h);
}
index -= kWindowSize;
GFp_nistz256_point_double(r, r);
GFp_nistz256_point_double(r, r);
GFp_nistz256_point_double(r, r);
GFp_nistz256_point_double(r, r);
GFp_nistz256_point_double(r, r);
}
/* Final window */
raw_wvalue = p_str[0];
raw_wvalue = (raw_wvalue << 1) & kMask;
booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize);
GFp_nistz256_select_w5(&h, table, recoded);
GFp_nistz256_neg(tmp, h.Y);
copy_conditional(h.Y, tmp, recoded_is_negative);
GFp_nistz256_point_add(r, r, &h);
}
#if defined(GFp_USE_LARGE_TABLE)
/* Precomputed tables for the default generator */
#include "ecp_nistz256_table.inl"
static const size_t kWindowSize = 7;
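/* Booth-recodes |raw_wvalue|, selects the matching entry from the |i|-th row
* of the precomputed table in constant time, and conditionally negates its Y
* coordinate when the recoded digit is negative. */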
static inline void select_precomputed(P256_POINT_AFFINE *p, size_t i,
crypto_word raw_wvalue) {
crypto_word recoded_is_negative;
crypto_word recoded;
booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize);
GFp_nistz256_select_w7(p, GFp_nistz256_precomputed[i], recoded);
Limb neg_y[P256_LIMBS];
GFp_nistz256_neg(neg_y, p->Y);
copy_conditional(p->Y, neg_y, recoded_is_negative);
}
/* This assumes that |x| and |y| have each been reduced to their minimal
* unique representations. */
static Limb is_infinity(const Limb x[P256_LIMBS],
const Limb y[P256_LIMBS]) {
Limb acc = 0;
for (size_t i = 0; i < P256_LIMBS; ++i) {
acc |= x[i] | y[i];
}
return constant_time_is_zero_w(acc);
}
void GFp_nistz256_point_mul_base(P256_POINT *r,
const Limb g_scalar[P256_LIMBS]) {
static const crypto_word kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
uint8_t p_str[(P256_LIMBS * sizeof(Limb)) + 1];
gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
g_scalar, P256_LIMBS);
/* First window */
size_t index = kWindowSize;
alignas(32) P256_POINT_AFFINE t;
crypto_word raw_wvalue = (p_str[0] << 1) & kMask;
select_precomputed(&t, 0, raw_wvalue);
alignas(32) P256_POINT p;
limbs_copy(p.X, t.X, P256_LIMBS);
limbs_copy(p.Y, t.Y, P256_LIMBS);
limbs_copy(p.Z, ONE, P256_LIMBS);
/* If it is the point at infinity then p.X will be zero. */
copy_conditional(p.Z, p.X, is_infinity(p.X, p.Y));
for (size_t i = 1; i < 37; i++) {
size_t off = (index - 1) / 8;
raw_wvalue = p_str[off] | p_str[off + 1] << 8;
raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask;
index += kWindowSize;
select_precomputed(&t, i, raw_wvalue);
GFp_nistz256_point_add_affine(&p, &p, &t);
}
limbs_copy(r->X, p.X, P256_LIMBS);
limbs_copy(r->Y, p.Y, P256_LIMBS);
limbs_copy(r->Z, p.Z, P256_LIMBS);
}
#endif

View File

@@ -0,0 +1,54 @@
/* Copyright (c) 2014, Intel Corporation.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#ifndef OPENSSL_HEADER_EC_ECP_NISTZ256_H
#define OPENSSL_HEADER_EC_ECP_NISTZ256_H
#include "../../limbs/limbs.h"
// Keep this in sync with p256.rs.
#if defined(OPENSSL_AARCH64) || defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
#define GFp_USE_LARGE_TABLE
#endif
#define P256_LIMBS (256u / LIMB_BITS)
typedef struct {
Limb X[P256_LIMBS];
Limb Y[P256_LIMBS];
Limb Z[P256_LIMBS];
} P256_POINT;
#if defined(GFp_USE_LARGE_TABLE)
typedef struct {
Limb X[P256_LIMBS];
Limb Y[P256_LIMBS];
} P256_POINT_AFFINE;
#endif
typedef Limb PRECOMP256_ROW[64 * 2 * P256_LIMBS]; // 64 (x, y) entries.
void GFp_nistz256_mul_mont(Limb res[P256_LIMBS], const Limb a[P256_LIMBS],
const Limb b[P256_LIMBS]);
void GFp_nistz256_sqr_mont(Limb res[P256_LIMBS], const Limb a[P256_LIMBS]);
/* Functions that perform constant time access to the precomputed tables */
void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16],
crypto_word index);
#if defined(GFp_USE_LARGE_TABLE)
void GFp_nistz256_select_w7(P256_POINT_AFFINE *out, const PRECOMP256_ROW table, crypto_word index);
#endif
#endif /* OPENSSL_HEADER_EC_ECP_NISTZ256_H */

File diff suppressed because it is too large

View File

@@ -0,0 +1,34 @@
/* Copyright (c) 2014, Intel Corporation.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H
#define OPENSSL_HEADER_EC_ECP_NISTZ384_H
#include "../../limbs/limbs.h"
#define P384_LIMBS (384u / LIMB_BITS)
typedef struct {
Limb X[P384_LIMBS];
Limb Y[P384_LIMBS];
Limb Z[P384_LIMBS];
} P384_POINT;
typedef struct {
Limb X[P384_LIMBS];
Limb Y[P384_LIMBS];
} P384_POINT_AFFINE;
#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H

View File

@@ -0,0 +1,257 @@
/* Copyright (c) 2014, Intel Corporation.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
/* Developers and authors:
* Shay Gueron (1, 2), and Vlad Krasnov (1)
* (1) Intel Corporation, Israel Development Center
* (2) University of Haifa
* Reference:
* Shay Gueron and Vlad Krasnov
* "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
* http://eprint.iacr.org/2013/816 */
#include "ecp_nistz.h"
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
/* Point double: r = 2*a */
void GFp_nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
BN_ULONG S[P384_LIMBS];
BN_ULONG M[P384_LIMBS];
BN_ULONG Zsqr[P384_LIMBS];
BN_ULONG tmp0[P384_LIMBS];
const BN_ULONG *in_x = a->X;
const BN_ULONG *in_y = a->Y;
const BN_ULONG *in_z = a->Z;
BN_ULONG *res_x = r->X;
BN_ULONG *res_y = r->Y;
BN_ULONG *res_z = r->Z;
elem_mul_by_2(S, in_y);
elem_sqr_mont(Zsqr, in_z);
elem_sqr_mont(S, S);
elem_mul_mont(res_z, in_z, in_y);
elem_mul_by_2(res_z, res_z);
elem_add(M, in_x, Zsqr);
elem_sub(Zsqr, in_x, Zsqr);
elem_sqr_mont(res_y, S);
elem_div_by_2(res_y, res_y);
elem_mul_mont(M, M, Zsqr);
elem_mul_by_3(M, M);
elem_mul_mont(S, S, in_x);
elem_mul_by_2(tmp0, S);
elem_sqr_mont(res_x, M);
elem_sub(res_x, res_x, tmp0);
elem_sub(S, S, res_x);
elem_mul_mont(S, S, M);
elem_sub(res_y, S, res_y);
}
/* Point addition: r = a+b */
void GFp_nistz384_point_add(P384_POINT *r, const P384_POINT *a,
const P384_POINT *b) {
BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS];
BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS];
BN_ULONG Z1sqr[P384_LIMBS];
BN_ULONG Z2sqr[P384_LIMBS];
BN_ULONG H[P384_LIMBS], R[P384_LIMBS];
BN_ULONG Hsqr[P384_LIMBS];
BN_ULONG Rsqr[P384_LIMBS];
BN_ULONG Hcub[P384_LIMBS];
BN_ULONG res_x[P384_LIMBS];
BN_ULONG res_y[P384_LIMBS];
BN_ULONG res_z[P384_LIMBS];
const BN_ULONG *in1_x = a->X;
const BN_ULONG *in1_y = a->Y;
const BN_ULONG *in1_z = a->Z;
const BN_ULONG *in2_x = b->X;
const BN_ULONG *in2_y = b->Y;
const BN_ULONG *in2_z = b->Z;
BN_ULONG in1infty = is_zero(a->Z);
BN_ULONG in2infty = is_zero(b->Z);
elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */
elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */
elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */
elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */
elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */
elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */
elem_sub(R, S2, S1); /* R = S2 - S1 */
elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */
elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */
elem_sub(H, U2, U1); /* H = U2 - U1 */
BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty;
if (is_exceptional) {
if (is_equal(S1, S2)) {
GFp_nistz384_point_double(r, a);
} else {
limbs_zero(r->X, P384_LIMBS);
limbs_zero(r->Y, P384_LIMBS);
limbs_zero(r->Z, P384_LIMBS);
}
return;
}
elem_sqr_mont(Rsqr, R); /* R^2 */
elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */
elem_sqr_mont(Hsqr, H); /* H^2 */
elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */
elem_mul_mont(Hcub, Hsqr, H); /* H^3 */
elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */
elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */
elem_sub(res_x, Rsqr, Hsqr);
elem_sub(res_x, res_x, Hcub);
elem_sub(res_y, U2, res_x);
elem_mul_mont(S2, S1, Hcub);
elem_mul_mont(res_y, R, res_y);
elem_sub(res_y, res_y, S2);
copy_conditional(res_x, in2_x, in1infty);
copy_conditional(res_y, in2_y, in1infty);
copy_conditional(res_z, in2_z, in1infty);
copy_conditional(res_x, in1_x, in2infty);
copy_conditional(res_y, in1_y, in2infty);
copy_conditional(res_z, in1_z, in2infty);
limbs_copy(r->X, res_x, P384_LIMBS);
limbs_copy(r->Y, res_y, P384_LIMBS);
limbs_copy(r->Z, res_z, P384_LIMBS);
}
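/* Adds to |r|, in constant time, the table entry selected by the signed 5-bit
* window |wvalue| (after Booth recoding), negating the selected point's Y
* coordinate when the recoded digit is negative. */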
static void add_precomputed_w5(P384_POINT *r, crypto_word wvalue,
const P384_POINT table[16]) {
crypto_word recoded_is_negative;
crypto_word recoded;
booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
alignas(64) P384_POINT h;
gfp_p384_point_select_w5(&h, table, recoded);
alignas(64) BN_ULONG tmp[P384_LIMBS];
GFp_p384_elem_neg(tmp, h.Y);
copy_conditional(h.Y, tmp, recoded_is_negative);
GFp_nistz384_point_add(r, r, &h);
}
/* r = p * p_scalar */
void GFp_nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS],
const BN_ULONG p_x[P384_LIMBS],
const BN_ULONG p_y[P384_LIMBS]) {
static const size_t kWindowSize = 5;
static const crypto_word kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1];
gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
p_scalar, P384_LIMBS);
/* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should
* add no more than 63 bytes of overhead. Thus, |table| should require
* ~2367 ((144 * 16) + 63) bytes of stack space. */
alignas(64) P384_POINT table[16];
/* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
* not stored. All other values are actually stored with an offset of -1 in
* table. */
P384_POINT *row = table;
limbs_copy(row[1 - 1].X, p_x, P384_LIMBS);
limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS);
limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS);
GFp_nistz384_point_double(&row[2 - 1], &row[1 - 1]);
GFp_nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
GFp_nistz384_point_double(&row[4 - 1], &row[2 - 1]);
GFp_nistz384_point_double(&row[6 - 1], &row[3 - 1]);
GFp_nistz384_point_double(&row[8 - 1], &row[4 - 1]);
GFp_nistz384_point_double(&row[12 - 1], &row[6 - 1]);
GFp_nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
GFp_nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
GFp_nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
GFp_nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
GFp_nistz384_point_double(&row[14 - 1], &row[7 - 1]);
GFp_nistz384_point_double(&row[10 - 1], &row[5 - 1]);
GFp_nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
GFp_nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
GFp_nistz384_point_double(&row[16 - 1], &row[8 - 1]);
static const size_t START_INDEX = 384 - 4;
size_t index = START_INDEX;
BN_ULONG recoded_is_negative;
crypto_word recoded;
crypto_word wvalue = p_str[(index - 1) / 8];
wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
dev_assert_secret(!recoded_is_negative);
gfp_p384_point_select_w5(r, table, recoded);
while (index >= kWindowSize) {
if (index != START_INDEX) {
size_t off = (index - 1) / 8;
wvalue = p_str[off] | p_str[off + 1] << 8;
wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
add_precomputed_w5(r, wvalue, table);
}
index -= kWindowSize;
GFp_nistz384_point_double(r, r);
GFp_nistz384_point_double(r, r);
GFp_nistz384_point_double(r, r);
GFp_nistz384_point_double(r, r);
GFp_nistz384_point_double(r, r);
}
/* Final window */
wvalue = p_str[0];
wvalue = (wvalue << 1) & kMask;
add_precomputed_w5(r, wvalue, table);
}
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

View File

@@ -0,0 +1,108 @@
/* Copyright 2016 Brian Smith.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "ecp_nistz256.h"
#include "../../limbs/limbs.h"
#include "../../internal.h"
#include "../bn/internal.h"
#include "../../limbs/limbs.inl"
typedef Limb Elem[P256_LIMBS];
typedef Limb ScalarMont[P256_LIMBS];
typedef Limb Scalar[P256_LIMBS];
void GFp_p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep);
#if defined(OPENSSL_ARM) || defined(OPENSSL_X86)
void GFp_nistz256_sqr_mont(Elem r, const Elem a) {
/* XXX: Inefficient. TODO: optimize with dedicated squaring routine. */
GFp_nistz256_mul_mont(r, a, a);
}
#endif
#if !defined(OPENSSL_X86_64)
void GFp_p256_scalar_mul_mont(ScalarMont r, const ScalarMont a,
const ScalarMont b) {
static const BN_ULONG N[] = {
TOBN(0xf3b9cac2, 0xfc632551),
TOBN(0xbce6faad, 0xa7179e84),
TOBN(0xffffffff, 0xffffffff),
TOBN(0xffffffff, 0x00000000),
};
static const BN_ULONG N_N0[] = {
BN_MONT_CTX_N0(0xccd1c8aa, 0xee00bc4f)
};
/* XXX: Inefficient. TODO: optimize with dedicated multiplication routine. */
GFp_bn_mul_mont(r, a, b, N, N_N0, P256_LIMBS);
}
#endif
#if defined(OPENSSL_X86_64)
void GFp_p256_scalar_sqr_mont(ScalarMont r, const ScalarMont a) {
GFp_p256_scalar_sqr_rep_mont(r, a, 1);
}
#else
void GFp_p256_scalar_sqr_mont(ScalarMont r, const ScalarMont a) {
GFp_p256_scalar_mul_mont(r, a, a);
}
void GFp_p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep) {
dev_assert_secret(rep >= 1);
GFp_p256_scalar_sqr_mont(r, a);
for (Limb i = 1; i < rep; ++i) {
GFp_p256_scalar_sqr_mont(r, r);
}
}
#endif
#if !defined(OPENSSL_X86_64)
/* TODO(perf): Optimize these. */
void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16],
crypto_word index) {
dev_assert_secret(index >= 0);
alignas(32) Elem x; limbs_zero(x, P256_LIMBS);
alignas(32) Elem y; limbs_zero(y, P256_LIMBS);
alignas(32) Elem z; limbs_zero(z, P256_LIMBS);
// TODO: Rewrite in terms of |limbs_select|.
for (size_t i = 0; i < 16; ++i) {
crypto_word equal = constant_time_eq_w(index, (crypto_word)i + 1);
for (size_t j = 0; j < P256_LIMBS; ++j) {
x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
}
}
limbs_copy(out->X, x, P256_LIMBS);
limbs_copy(out->Y, y, P256_LIMBS);
limbs_copy(out->Z, z, P256_LIMBS);
}
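GFp_nistz256_select_w5 touches all 16 table entries and folds the wanted one in with masks, so the sequence of memory accesses does not depend on the secret index (index 0 leaves every limb zero). The fragment below is only a functional model of that mask-and-merge idea, with hypothetical helper names; Python itself gives no constant-time guarantees:

```python
MASK = (1 << 64) - 1                      # model a 64-bit limb

def ct_select(mask, a, b):
    # mask is all-ones to take a, all-zeros to keep b
    return (a & mask) | (b & ~mask & MASK)

def select_w5(table, index):
    # index in 1..16 picks table[index - 1]; index 0 yields zero
    out = 0
    for i, entry in enumerate(table):
        equal = -(index == i + 1) & MASK  # all-ones exactly for the wanted slot
        out = ct_select(equal, entry, out)
    return out

table = [0x1111 * (i + 1) for i in range(16)]
assert select_w5(table, 0) == 0
assert all(select_w5(table, i + 1) == table[i] for i in range(16))
```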
#if defined GFp_USE_LARGE_TABLE
void GFp_nistz256_select_w7(P256_POINT_AFFINE *out,
const PRECOMP256_ROW table, crypto_word index) {
alignas(32) Limb xy[P256_LIMBS * 2];
limbs_select(xy, table, P256_LIMBS * 2, 64, index - 1);
limbs_copy(out->X, &xy[0], P256_LIMBS);
limbs_copy(out->Y, &xy[P256_LIMBS], P256_LIMBS);
}
#endif
#endif

View File

@@ -0,0 +1,242 @@
/* Copyright 2016 Brian Smith.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "../../limbs/limbs.h"
#include "ecp_nistz384.h"
#include "../bn/internal.h"
#include "../../internal.h"
#include "../../limbs/limbs.inl"
/* XXX: Here we assume that the conversion from |Carry| to |Limb| is
* constant-time, but we haven't verified that assumption. TODO: Fix it so
* we don't need to make that assumption. */
typedef Limb Elem[P384_LIMBS];
typedef Limb ScalarMont[P384_LIMBS];
typedef Limb Scalar[P384_LIMBS];
static const BN_ULONG Q[P384_LIMBS] = {
TOBN(0x00000000, 0xffffffff),
TOBN(0xffffffff, 0x00000000),
TOBN(0xffffffff, 0xfffffffe),
TOBN(0xffffffff, 0xffffffff),
TOBN(0xffffffff, 0xffffffff),
TOBN(0xffffffff, 0xffffffff),
};
static const BN_ULONG N[P384_LIMBS] = {
TOBN(0xecec196a, 0xccc52973),
TOBN(0x581a0db2, 0x48b0a77a),
TOBN(0xc7634d81, 0xf4372ddf),
TOBN(0xffffffff, 0xffffffff),
TOBN(0xffffffff, 0xffffffff),
TOBN(0xffffffff, 0xffffffff),
};
static const BN_ULONG ONE[P384_LIMBS] = {
TOBN(0xffffffff, 1), TOBN(0, 0xffffffff), TOBN(0, 1), TOBN(0, 0), TOBN(0, 0),
TOBN(0, 0),
};
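For orientation, ONE appears to be 1 in the Montgomery domain, i.e. R mod q with R = 2^384. A quick Python check of that reading (assuming 64-bit limbs; on 32-bit targets TOBN merely splits each value into two limbs with the same overall little-endian encoding):

```python
q = 2**384 - 2**128 - 2**96 + 2**32 - 1        # the value of Q above
# ONE as little-endian 64-bit limbs, transcribed from the table above
one = [0xFFFFFFFF_00000001, 0x00000000_FFFFFFFF, 1, 0, 0, 0]
assert sum(limb << (64 * i) for i, limb in enumerate(one)) == pow(2, 384, q)
```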
/* XXX: MSVC for x86 warns when it fails to inline these functions that it
* should probably inline. */
#if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86)
#define INLINE_IF_POSSIBLE __forceinline
#else
#define INLINE_IF_POSSIBLE inline
#endif
static inline Limb is_equal(const Elem a, const Elem b) {
return LIMBS_equal(a, b, P384_LIMBS);
}
static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) {
return LIMBS_are_zero(a, P384_LIMBS);
}
static inline void copy_conditional(Elem r, const Elem a,
const Limb condition) {
for (size_t i = 0; i < P384_LIMBS; ++i) {
r[i] = constant_time_select_w(condition, a[i], r[i]);
}
}
static inline void elem_add(Elem r, const Elem a, const Elem b) {
LIMBS_add_mod(r, a, b, Q, P384_LIMBS);
}
static inline void elem_sub(Elem r, const Elem a, const Elem b) {
LIMBS_sub_mod(r, a, b, Q, P384_LIMBS);
}
static void elem_div_by_2(Elem r, const Elem a) {
/* Consider the case where `a` is even. Then we can shift `a` right one bit
* and the result will still be valid because we didn't lose any bits and so
* `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy.
*
* The remainder of this comment is considering the case where `a` is odd.
*
* Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)`
* because the lowest bit is lost during the shift. For example, consider:
*
* ```python
* q = 2**384 - 2**128 - 2**96 + 2**32 - 1
* a = 2**383
* two_a = a * 2 % q
* assert two_a == 0x100000000ffffffffffffffff00000001
* ```
*
* Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When
* we divide `two_a` by two (mod q), we need to get the value `2**383`, which
* we obviously can't get with just a right shift.
*
* `q` is odd, and `a` is odd, so `a + q` is even. We could calculate
* `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to
* keep track of an extra most significant bit. We can avoid that by instead
* calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least
* significant bit of `a`. `q + 1` is even, which means it can be shifted
* without losing any bits. Since `q` is odd, `q - 1` is even, so the largest
* odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know
* `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of
* `a >> 1` is `(a - 1)/2` since the shift will drop the least significant
* bit of `a`, which is 1. Thus:
*
* sum = ((q + 1) >> 1) + (a >> 1)
* sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2)
* <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2)
* <= (q + 1)/2 + (q - 3)/2 (simplifying)
* <= (q + 1 + q - 3)/2 (factoring out the common divisor)
* <= (2q - 2)/2 (simplifying)
* <= q - 1 (simplifying)
*
* Thus, no reduction of the sum mod `q` is necessary. */
Limb is_odd = constant_time_is_nonzero_w(a[0] & 1);
/* r = a >> 1. */
Limb carry = a[P384_LIMBS - 1] & 1;
r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1;
for (size_t i = 1; i < P384_LIMBS; ++i) {
Limb new_carry = a[P384_LIMBS - i - 1];
r[P384_LIMBS - i - 1] =
(a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1));
carry = new_carry;
}
static const Elem Q_PLUS_1_SHR_1 = {
TOBN(0x00000000, 0x80000000), TOBN(0x7fffffff, 0x80000000),
TOBN(0xffffffff, 0xffffffff), TOBN(0xffffffff, 0xffffffff),
TOBN(0xffffffff, 0xffffffff), TOBN(0x7fffffff, 0xffffffff),
};
Elem adjusted;
BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS);
dev_assert_secret(carry2 == 0);
(void)carry2;
copy_conditional(r, adjusted, is_odd);
}
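The argument in the comment above is easy to sanity-check with plain Python integers: for odd a the code effectively returns (a >> 1) + (q + 1)/2 = (a + q)/2, and the bound derived in the comment guarantees the sum never reaches q. This checks only the arithmetic identity and the Q_PLUS_1_SHR_1 constant, not the constant-time limb code itself:

```python
import random

q = 2**384 - 2**128 - 2**96 + 2**32 - 1      # same q as in the comment above
half = (q + 1) // 2

# Q_PLUS_1_SHR_1 really is (q + 1) >> 1 (little-endian 64-bit limbs):
assert half == sum(limb << (64 * i) for i, limb in enumerate(
    [0x00000000_80000000, 0x7FFFFFFF_80000000, 0xFFFFFFFF_FFFFFFFF,
     0xFFFFFFFF_FFFFFFFF, 0xFFFFFFFF_FFFFFFFF, 0x7FFFFFFF_FFFFFFFF]))

for _ in range(10000):
    a = random.randrange(q)
    r = a >> 1
    if a & 1:
        r += half
        assert r <= q - 1                    # no reduction mod q is ever needed
    assert (2 * r) % q == a                  # r is a/2 mod q
```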
static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) {
static const BN_ULONG Q_N0[] = {
BN_MONT_CTX_N0(0x1, 0x1)
};
/* XXX: Not (clearly) constant-time; inefficient.*/
GFp_bn_mul_mont(r, a, b, Q, Q_N0, P384_LIMBS);
}
static inline void elem_mul_by_2(Elem r, const Elem a) {
LIMBS_shl_mod(r, a, Q, P384_LIMBS);
}
static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) {
/* XXX: inefficient. TODO: Replace with an integrated shift + add. */
Elem doubled;
elem_add(doubled, a, a);
elem_add(r, doubled, a);
}
static inline void elem_sqr_mont(Elem r, const Elem a) {
/* XXX: Inefficient. TODO: Add a dedicated squaring routine. */
elem_mul_mont(r, a, a);
}
void GFp_p384_elem_add(Elem r, const Elem a, const Elem b) {
elem_add(r, a, b);
}
void GFp_p384_elem_sub(Elem r, const Elem a, const Elem b) {
elem_sub(r, a, b);
}
void GFp_p384_elem_div_by_2(Elem r, const Elem a) {
elem_div_by_2(r, a);
}
void GFp_p384_elem_mul_mont(Elem r, const Elem a, const Elem b) {
elem_mul_mont(r, a, b);
}
void GFp_p384_elem_neg(Elem r, const Elem a) {
Limb is_zero = LIMBS_are_zero(a, P384_LIMBS);
Carry borrow = limbs_sub(r, Q, a, P384_LIMBS);
dev_assert_secret(borrow == 0);
(void)borrow;
for (size_t i = 0; i < P384_LIMBS; ++i) {
r[i] = constant_time_select_w(is_zero, 0, r[i]);
}
}
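The masking step above exists because q - a is a reduced result only for a != 0 (for a = 0 the subtraction would give q itself rather than 0). A tiny Python illustration of the intended behaviour, as a functional model only:

```python
q = 2**384 - 2**128 - 2**96 + 2**32 - 1

def neg_mod_q(a):
    return 0 if a == 0 else q - a            # models GFp_p384_elem_neg

assert all((a + neg_mod_q(a)) % q == 0 and neg_mod_q(a) < q
           for a in (0, 1, 12345, q - 1))
```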
void GFp_p384_scalar_mul_mont(ScalarMont r, const ScalarMont a,
const ScalarMont b) {
static const BN_ULONG N_N0[] = {
BN_MONT_CTX_N0(0x6ed46089, 0xe88fdc45)
};
/* XXX: Inefficient. TODO: Add dedicated multiplication routine. */
GFp_bn_mul_mont(r, a, b, N, N_N0, P384_LIMBS);
}
/* TODO(perf): Optimize this. */
static void gfp_p384_point_select_w5(P384_POINT *out,
const P384_POINT table[16], size_t index) {
Elem x; limbs_zero(x, P384_LIMBS);
Elem y; limbs_zero(y, P384_LIMBS);
Elem z; limbs_zero(z, P384_LIMBS);
// TODO: Rewrite in terms of |limbs_select|.
for (size_t i = 0; i < 16; ++i) {
crypto_word equal = constant_time_eq_w(index, (crypto_word)i + 1);
for (size_t j = 0; j < P384_LIMBS; ++j) {
x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
}
}
limbs_copy(out->X, x, P384_LIMBS);
limbs_copy(out->Y, y, P384_LIMBS);
limbs_copy(out->Z, z, P384_LIMBS);
}
#include "ecp_nistz384.inl"

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,300 @@
#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
# bit shift operation is neatly fused with 128-bit xor here, and
# "538B" variant would eliminate only 4-5 instructions out of 32
# in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
# consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.
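The NEON path below assembles a 64x64 -> 128-bit carry-less product from vmull.p8 pieces and then combines three such products Karatsuba-style (the H.lo·Xi.lo, H.hi·Xi.hi and (H.lo+H.hi)·(Xi.lo+Xi.hi) terms in the comments). The following Python sketch models only that GF(2) Karatsuba recombination, not the NEON scheduling or the final reduction modulo the GHASH polynomial:

```python
import random

def clmul(a, b):
    """Carry-less multiplication: a and b viewed as polynomials over GF(2)."""
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

for _ in range(1000):
    h, x = random.getrandbits(128), random.getrandbits(128)
    h0, h1 = h & (2**64 - 1), h >> 64
    x0, x1 = x & (2**64 - 1), x >> 64
    lo  = clmul(h0, x0)                      # H.lo . Xi.lo
    hi  = clmul(h1, x1)                      # H.hi . Xi.hi
    mid = clmul(h0 ^ h1, x0 ^ x1)            # (H.lo+H.hi) . (Xi.lo+Xi.hi)
    mid ^= lo ^ hi                           # Karatsuba post-processing
    assert (hi << 128) ^ (mid << 64) ^ lo == clmul(h, x)
```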
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$code=<<___;
#include <GFp/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in aesv8-armx.pl.)
.arch armv7-a
.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
___
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
___
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global GFp_gcm_init_neon
.type GFp_gcm_init_neon,%function
.align 4
GFp_gcm_init_neon:
vld1.64 $IN#hi,[r1]! @ load H
vmov.i8 $t0,#0xe1
vld1.64 $IN#lo,[r1]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
vshr.u64 $Hlo,$IN#lo,#63
vshr.s8 $t1,#7 @ broadcast carry bit
vshl.i64 $IN,$IN,#1
vand $t0,$t0,$t1
vorr $IN#hi,$Hlo @ H<<<=1
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
ret @ bx lr
.size GFp_gcm_init_neon,.-GFp_gcm_init_neon
.global GFp_gcm_gmult_neon
.type GFp_gcm_gmult_neon,%function
.align 4
GFp_gcm_gmult_neon:
vld1.64 $IN#hi,[$Xi]! @ load Xi
vld1.64 $IN#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
b .Lgmult_neon
.size GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon
.global GFp_gcm_ghash_neon
.type GFp_gcm_ghash_neon,%function
.align 4
GFp_gcm_ghash_neon:
vld1.64 $Xl#hi,[$Xi]! @ load Xi
vld1.64 $Xl#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
.Loop_neon:
vld1.64 $IN#hi,[$inp]! @ load inp
vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
veor $Xl#hi,$Xl#hi,$Xm#lo
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 $t1,$Xl,#57 @ 1st phase
vshl.i64 $t2,$Xl,#62
veor $t2,$t2,$t1 @
vshl.i64 $t1,$Xl,#63
veor $t2, $t2, $t1 @
veor $Xl#hi,$Xl#hi,$t2#lo @
veor $Xh#lo,$Xh#lo,$t2#hi
vshr.u64 $t2,$Xl,#1 @ 2nd phase
veor $Xh,$Xh,$Xl
veor $Xl,$Xl,$t2 @
vshr.u64 $t2,$t2,#6
vshr.u64 $Xl,$Xl,#1 @
veor $Xl,$Xl,$Xh @
veor $Xl,$Xl,$t2 @
subs $len,#16
bne .Loop_neon
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT"; # enforce flush

View File

@@ -0,0 +1,714 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, May, June 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
# code paths: vanilla x86 and vanilla SSE. Former will be executed on
# 486 and Pentium, latter on all others. SSE GHASH features so called
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
# of per-key storage [+512 bytes shared table]. Performance results
# are for streamed GHASH subroutine and are expressed in cycles per
# processed byte, less is better:
#
# gcc 2.95.3(*) SSE assembler x86 assembler
#
# Pentium 105/111(**) - 50
# PIII 68 /75 12.2 24
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
# Atom 105/105 16.8 53
# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
# another reason is lack of 3.4.x results for older CPUs;
# comparison with SSE results is not completely fair, because C
# results are for vanilla "256B" implementation, while
# assembler results are for "528B";-)
# (**) second number is result for code compiled with -fPIC flag,
# which is actually more relevant, because assembler code is
# position-independent;
# (***) see comment in non-MMX routine for further details;
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes one byte in
# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
# in particular, see comment at the end of the file...
# May 2010
#
# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
# The question is how close is it to theoretical limit? The pclmulqdq
# instruction latency appears to be 14 cycles and there can't be more
# than 2 of them executing at any given time. This means that single
# Karatsuba multiplication would take 28 cycles *plus* few cycles for
# pre- and post-processing. Then multiplication has to be followed by
# modulo-reduction. Given that aggregated reduction method [see
# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
# white paper by Intel] allows you to perform reduction only once in
# a while we can assume that asymptotic performance can be estimated
# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
# and Naggr is the aggregation factor.
#
# Before we proceed to this implementation let's have closer look at
# the best-performing code suggested by Intel in their white paper.
# By tracing inter-register dependencies Tmod is estimated as ~19
# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
# processed byte. As implied, this is quite optimistic estimate,
# because it does not account for Karatsuba pre- and post-processing,
# which for a single multiplication is ~5 cycles. Unfortunately Intel
# does not provide performance data for GHASH alone. But benchmarking
# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
# the result accounts even for pre-computing of degrees of the hash
# key H, but its portion is negligible at 16KB buffer size.
#
# Moving on to the implementation in question. Tmod is estimated as
# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
# 2.16. How is it possible that measured performance is better than
# optimistic theoretical estimate? There is one thing Intel failed
# to recognize. By serializing GHASH with CTR in same subroutine
# former's performance is really limited to above (Tmul + Tmod/Naggr)
# equation. But if GHASH procedure is detached, the modulo-reduction
# can be interleaved with Naggr-1 multiplications at instruction level
# and under ideal conditions even disappear from the equation. So that
# optimistic theoretical estimate for this implementation is ...
# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
# at least for such small Naggr. I'd argue that (28+Tproc/Naggr)/16,
# where Tproc is time required for Karatsuba pre- and post-processing,
# is more realistic estimate. In this case it gives ... 1.91 cycles.
# Or in other words, depending on how well we can interleave reduction
# and one of the two multiplications the performance should be between
# 1.91 and 2.16. As already mentioned, this implementation processes
# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
# - in 2.02. x86_64 performance is better, because larger register
# bank allows to interleave reduction and multiplication better.
#
# Does it make sense to increase Naggr? To start with it's virtually
# impossible in 32-bit mode, because of limited register bank
# capacity. Otherwise improvement has to be weighed against slower
# setup, as well as code size and complexity increase. As even
# optimistic estimate doesn't promise 30% performance improvement,
# there are currently no plans to increase Naggr.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.
# January 2010
#
# Tweaked to optimize transitions between integer and FP operations
# on same XMM register, PCLMULQDQ subroutine was measured to process
# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
# The minor regression on Westmere is outweighed by ~15% improvement
# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
#####################################################################
# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
# 32-bit mode and 1.89 in 64-bit.
# February 2013
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9. Resulting performance is 1.96 cycles per byte on
# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
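The cycles-per-byte figures quoted in the commentary above all come from the (Tmul + Tmod/Naggr)/16 model with Tmul ~28; a throwaway Python check that the arithmetic works out as stated:

```python
def cpb(tmul, tmod_or_tproc, naggr):
    # cycles per byte for one 16-byte block with aggregated reduction
    return (tmul + tmod_or_tproc / naggr) / 16

assert round(cpb(28, 19, 4), 2) == 2.05   # Intel's Tmod ~19, Naggr = 4
assert round(cpb(28, 13, 2), 2) == 2.16   # this module's Tmod ~13, Naggr = 2
assert round(cpb(28,  5, 2), 2) == 1.91   # with Tproc ~5 in place of Tmod
```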
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output=pop;
open STDOUT,">$output";
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$sse2=1;
if ($sse2) {{
######################################################################
# PCLMULQDQ version.
$Xip="eax";
$Htbl="edx";
$const="ecx";
$inp="esi";
$len="ebx";
($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
($Xn,$Xhn)=("xmm6","xmm7");
&static_label("bswap");
sub clmul64x64_T2 { # minimal "register" pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
&pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); #
&pxor ($T2,$Hkey) if (!defined($HK));
$HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T1,$HK,0x00); #######
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); #
&movdqa ($T2,$T1); #
&psrldq ($T1,8);
&pslldq ($T2,8); #
&pxor ($Xhi,$T1);
&pxor ($Xi,$T2); #
}
sub clmul64x64_T3 {
# Even though this subroutine offers visually better ILP, it
# was empirically found to be a tad slower than above version.
# At least in GFp_gcm_ghash_clmul context. But it's just as well,
# because loop modulo-scheduling is possible only thanks to
# minimized "register" pressure...
my ($Xhi,$Xi,$Hkey)=@_;
&movdqa ($T1,$Xi); #
&movdqa ($Xhi,$Xi);
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pshufd ($T2,$T1,0b01001110); #
&pshufd ($T3,$Hkey,0b01001110);
&pxor ($T2,$T1); #
&pxor ($T3,$Hkey);
&pclmulqdq ($T2,$T3,0x00); #######
&pxor ($T2,$Xi); #
&pxor ($T2,$Xhi); #
&movdqa ($T3,$T2); #
&psrldq ($T2,8);
&pslldq ($T3,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T3); #
}
if (1) { # Algorithm 9 with <<1 twist.
# Reduction is shorter and uses only two
# temporary registers, which makes it better
# candidate for interleaving with 64x64
# multiplication. Pre-modulo-scheduled loop
# was found to be ~20% faster than Algorithm 5
# below. Algorithm 9 was therefore chosen for
# further optimization...
sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
&movdqa ($T2,$Xi); #
&movdqa ($T1,$Xi);
&psllq ($Xi,5);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
&psrlq ($Xi,1);
&pxor ($Xhi,$T2); #
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
&pxor ($Xi,$Xhi) #
}
&function_begin_B("GFp_gcm_init_clmul");
&mov ($Htbl,&wparam(0));
&mov ($Xip,&wparam(1));
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Hkey,&QWP(0,$Xip));
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
# <<1 twist
&pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
&movdqa ($T1,$Hkey);
&psllq ($Hkey,1);
&pxor ($T3,$T3); #
&psrlq ($T1,63);
&pcmpgtd ($T3,$T2); # broadcast carry bit
&pslldq ($T1,8);
&por ($Hkey,$T1); # H<<=1
# magic reduction
&pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
&pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
# calculate H^2
&movdqa ($Xi,$Hkey);
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
&pshufd ($T1,$Hkey,0b01001110);
&pshufd ($T2,$Xi,0b01001110);
&pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
&pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
&palignr ($T2,$T1,8); # low part is H.lo^H.hi
&movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("GFp_gcm_init_clmul");
&function_begin_B("GFp_gcm_gmult_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&movups ($T2,&QWP(32,$Htbl));
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
&reduction_alg9 ($Xhi,$Xi);
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&ret ();
&function_end_B("GFp_gcm_gmult_clmul");
&function_begin("GFp_gcm_ghash_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&mov ($inp,&wparam(2));
&mov ($len,&wparam(3));
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&sub ($len,0x10);
&jz (&label("odd_tail"));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
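# (Xi+1 itself is H*(Ii+Xi), so the H*Xi+1 term above becomes H^2*(Ii+Xi);
# that substitution is what lets two input blocks share a single reduction.)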
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
&movdqa ($Xhn,$Xn);
&pxor ($T1,$Xn); #
&lea ($inp,&DWP(32,$inp)); # i+=2
&pclmulqdq ($Xn,$Hkey,0x00); #######
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&pclmulqdq ($T1,$T3,0x00); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&nop ();
&sub ($len,0x20);
&jbe (&label("even_tail"));
&jmp (&label("mod_loop"));
&set_label("mod_loop",32);
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
&nop ();
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T2,$T3,0x10); #######
&movups ($Hkey,&QWP(0,$Htbl)); # load H
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&movdqa ($T3,&QWP(0,$const));
&xorps ($Xhi,$Xhn);
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pxor ($T1,$Xhi); #
&pshufb ($Xhn,$T3);
&pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pslldq ($T1,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T1); #
&pshufb ($Xn,$T3);
&pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
&movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
&movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
&movdqa ($T1,$Xi);
&psllq ($Xi,5);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
&pclmulqdq ($Xn,$Hkey,0x00); #######
&movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);
&movdqa ($T2,$Xi); # 2nd phase
&psrlq ($Xi,1);
&pxor ($T1,$Xhn);
&pxor ($Xhi,$T2); #
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
&pxor ($Xi,$Xhi) #
&pclmulqdq ($T1,$T3,0x00); #######
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
&set_label("even_tail");
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T2,$T3,0x10); #######
&movdqa ($T3,&QWP(0,$const));
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&xorps ($Xhi,$Xhn);
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&pxor ($T1,$Xhi); #
&pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pslldq ($T1,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
&test ($len,$len);
&jnz (&label("done"));
&movups ($Hkey,&QWP(0,$Htbl)); # load H
&set_label("odd_tail");
&movdqu ($T1,&QWP(0,$inp)); # Ii
&pshufb ($T1,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
&set_label("done");
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("GFp_gcm_ghash_clmul");
} else { # Algorithm 5. Kept for reference purposes.
sub reduction_alg5 { # 19/16 times faster than Intel version
my ($Xhi,$Xi)=@_;
# <<1
&movdqa ($T1,$Xi); #
&movdqa ($T2,$Xhi);
&pslld ($Xi,1);
&pslld ($Xhi,1); #
&psrld ($T1,31);
&psrld ($T2,31); #
&movdqa ($T3,$T1);
&pslldq ($T1,4);
&psrldq ($T3,12); #
&pslldq ($T2,4);
&por ($Xhi,$T3); #
&por ($Xi,$T1);
&por ($Xhi,$T2); #
# 1st phase
&movdqa ($T1,$Xi);
&movdqa ($T2,$Xi);
&movdqa ($T3,$Xi); #
&pslld ($T1,31);
&pslld ($T2,30);
&pslld ($Xi,25); #
&pxor ($T1,$T2);
&pxor ($T1,$Xi); #
&movdqa ($T2,$T1); #
&pslldq ($T1,12);
&psrldq ($T2,4); #
&pxor ($T3,$T1);
# 2nd phase
&pxor ($Xhi,$T3); #
&movdqa ($Xi,$T3);
&movdqa ($T1,$T3);
&psrld ($Xi,1); #
&psrld ($T1,2);
&psrld ($T3,7); #
&pxor ($Xi,$T1);
&pxor ($Xhi,$T2);
&pxor ($Xi,$T3); #
&pxor ($Xi,$Xhi); #
}
&function_begin_B("GFp_gcm_init_clmul");
&mov ($Htbl,&wparam(0));
&mov ($Xip,&wparam(1));
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Hkey,&QWP(0,$Xip));
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
# calculate H^2
&movdqa ($Xi,$Hkey);
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
&reduction_alg5 ($Xhi,$Xi);
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
&ret ();
&function_end_B("GFp_gcm_init_clmul");
&function_begin_B("GFp_gcm_gmult_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($Xn,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$Xn);
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
&reduction_alg5 ($Xhi,$Xi);
&pshufb ($Xi,$Xn);
&movdqu (&QWP(0,$Xip),$Xi);
&ret ();
&function_end_B("GFp_gcm_gmult_clmul");
&function_begin("GFp_gcm_ghash_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&mov ($inp,&wparam(2));
&mov ($len,&wparam(3));
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&sub ($len,0x10);
&jz (&label("odd_tail"));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
&sub ($len,0x20);
&lea ($inp,&DWP(32,$inp)); # i+=2
&jbe (&label("even_tail"));
&set_label("mod_loop");
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&pxor ($Xhi,$Xhn);
&reduction_alg5 ($Xhi,$Xi);
#######
&movdqa ($T3,&QWP(0,$const));
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
&sub ($len,0x20);
&lea ($inp,&DWP(32,$inp));
&ja (&label("mod_loop"));
&set_label("even_tail");
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&pxor ($Xhi,$Xhn);
&reduction_alg5 ($Xhi,$Xi);
&movdqa ($T3,&QWP(0,$const));
&test ($len,$len);
&jnz (&label("done"));
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
&set_label("odd_tail");
&movdqu ($T1,&QWP(0,$inp)); # Ii
&pshufb ($T1,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg5 ($Xhi,$Xi);
&movdqa ($T3,&QWP(0,$const));
&set_label("done");
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("GFp_gcm_ghash_clmul");
}
&set_label("bswap",64);
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
&set_label("rem_8bit",64);
&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
&data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
&data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
&data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
&data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
&data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
&data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
&data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
&data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
&data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
&data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
&data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
&data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
&data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
&data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
&data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
&data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
&data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
&data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
&data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
&data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
&data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
&data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
&data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
&data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
&data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
&data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
&data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
}} # $sse2
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT";
# A question was raised about the choice of vanilla MMX. Or rather why wasn't
# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
# CPUs such as PIII, "4-bit" MMX version was observed to provide better
# performance than *corresponding* SSE2 one even on contemporary CPUs.
# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
# implementation featuring full range of lookup-table sizes, but with
# per-invocation lookup table setup. Latter means that table size is
# chosen depending on how much data is to be hashed in every given call,
# more data - larger table. Best reported result for Core2 is ~4 cycles
# per processed byte out of 64KB block. This number accounts even for
# 64KB table setup overhead. As discussed in gcm128.c we choose to be
# more conservative in respect to lookup table sizes, but how do the
# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
# on same platform. As also discussed in gcm128.c, next in line "8-bit
# Shoup's" or "4KB" method should deliver twice the performance of
# "256B" one, in other words not worse than ~6 cycles per byte. It
# should also be noted that in the SSE2 case the improvement can be "super-
# linear," i.e. more than twice, mostly because >>8 maps to single
# instruction on SSE2 register. This is unlike "4-bit" case when >>4
# maps to same amount of instructions in both MMX and SSE2 cases.
# Bottom line is that switch to SSE2 is considered to be justifiable
# only in case we choose to implement "8-bit" method...

File diff suppressed because it is too large

View File

@@ -0,0 +1,432 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
# Initial version was developed in tight cooperation with Ard Biesheuvel
# of Linaro from bits-n-pieces from other assembly modules. Just like
# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
#
# July 2014
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# Current performance in cycles per processed byte:
#
# PMULL[2] 32-bit NEON(*)
# Apple A7 0.92 5.62
# Cortex-A53 1.01 8.39
# Cortex-A57 1.17 7.61
# Denver 0.71 6.02
# Mongoose 1.10 8.06
# Kryo 1.16 8.00
#
# (*) presented for reference/comparison purposes;
$flavour = shift;
$output = shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";
$inc="x12";
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
$code=<<___;
#include <GFp/arm_arch.h>
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
.code 32
#undef __thumb2__
___
################################################################################
# void GFp_gcm_init_clmul(u128 Htable[16],const u64 H[2]);
#
# input: 128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
# H is twisted to handle reverse bitness of GHASH;
# only a few of the 16 slots of Htable[16] are used;
# data is opaque to the outside world (which allows the
# code to be optimized independently);
#
$code.=<<___;
.global GFp_gcm_init_clmul
.type GFp_gcm_init_clmul,%function
.align 4
GFp_gcm_init_clmul:
AARCH64_VALID_CALL_TARGET
vld1.64 {$t1},[x1] @ load input H
vmov.i8 $xC2,#0xe1
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
vext.8 $IN,$t1,$t1,#8
vshr.u64 $t2,$xC2,#63
vdup.32 $t1,${t1}[1]
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
vshr.u64 $t2,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t2,$t2,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t2,$t2,$t2,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vst1.64 {$H},[x0],#16 @ store Htable[0]
@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull2.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $H2,$Xl,$t2
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2]
ret
.size GFp_gcm_init_clmul,.-GFp_gcm_init_clmul
___
################################################################################
# void GFp_gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
#
# input: Xi - current hash value;
# Htable - table precomputed in GFp_gcm_init_clmul;
# output: Xi - next hash value Xi;
#
$code.=<<___;
.global GFp_gcm_gmult_clmul
.type GFp_gcm_gmult_clmul,%function
.align 4
GFp_gcm_gmult_clmul:
AARCH64_VALID_CALL_TARGET
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $IN,$t1,$t1,#8
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size GFp_gcm_gmult_clmul,.-GFp_gcm_gmult_clmul
___
################################################################################
# void GFp_gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# size_t len);
#
# input: table precomputed in GFp_gcm_init_clmul;
# current hash value Xi;
# pointer to input data;
# length of input data in bytes; must be divisible by the block size;
# output: next hash value Xi;
#
$code.=<<___;
.global GFp_gcm_ghash_clmul
.type GFp_gcm_ghash_clmul,%function
.align 4
GFp_gcm_ghash_clmul:
AARCH64_VALID_CALL_TARGET
___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ algorithm specification
subs $len,$len,#32 @ see if $len is 32 or larger
mov $inc,#16 @ $inc is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ $inc is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H2},[$Htbl]
cclr $inc,eq @ is it time to zero $inc?
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 $t0,$t0
vrev64.8 $Xl,$Xl
#endif
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
b.lo .Lodd_tail_v8 @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
$code.=<<___;
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $In,$t1,$t1,#8
veor $IN,$IN,$Xl @ I[i]^=Xi
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $t1,$t1,$In @ Karatsuba pre-processing
vpmull2.p64 $Xhn,$H,$In
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
vext.8 $t2,$IN,$IN,#8
subs $len,$len,#32 @ is there more data?
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
cclr $inc,lo @ is it time to zero $inc?
vpmull.p64 $Xmn,$Hhl,$t1
veor $t2,$t2,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
veor $Xl,$Xl,$Xln @ accumulate
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
veor $Xh,$Xh,$Xhn
cclr $inc,eq @ is it time to zero $inc?
veor $Xm,$Xm,$Xmn
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 $t0,$t0
#endif
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $In,$t1,$t1,#8
vext.8 $IN,$t0,$t0,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $IN,$IN,$Xh @ accumulate $IN early
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $IN,$IN,$t2
veor $t1,$t1,$In @ Karatsuba pre-processing
veor $IN,$IN,$Xl
vpmull2.p64 $Xhn,$H,$In
b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor $Xh,$Xh,$t2
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
adds $len,$len,#32 @ re-construct $len
veor $Xl,$Xl,$Xh @ re-construct $Xl
b.eq .Ldone_v8 @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
vext.8 $t2,$Xl,$Xl,#8
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
ret
.size GFp_gcm_ghash_clmul,.-GFp_gcm_ghash_clmul
___
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($flavour =~ /64/) { ######## 64-bit code
sub unvmov {
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
# fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
print $_,"\n";
}
} else { ######## 32-bit code
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvpmullp64 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
$word |= 0x00010001 if ($mnemonic =~ "2");
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT"; # enforce flush

View File

@@ -0,0 +1,736 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
rev $t1,$t1
# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
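BODY_00_15 and BODY_16_XX above emit ARM code for the standard FIPS 180-4 compression round and message-schedule step, with the rotation counts taken from @Sigma0, @Sigma1, @sigma0 and @sigma1. As a plain-Python reference for what those rounds compute, checked against hashlib; this mirrors the specification, not the register scheduling of the generated assembly:

```python
import hashlib
import struct

K = [
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
]  # identical to the K256 table below

def rotr(x, n):
    return ((x >> n) | (x << (32 - n))) & 0xFFFFFFFF

def compress(state, block):
    w = list(struct.unpack(">16L", block))
    for i in range(16, 64):                                    # BODY_16_XX
        s0 = rotr(w[i - 15], 7) ^ rotr(w[i - 15], 18) ^ (w[i - 15] >> 3)
        s1 = rotr(w[i - 2], 17) ^ rotr(w[i - 2], 19) ^ (w[i - 2] >> 10)
        w.append((w[i - 16] + s0 + w[i - 7] + s1) & 0xFFFFFFFF)
    a, b, c, d, e, f, g, h = state
    for i in range(64):                                        # BODY_00_15
        S1 = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)
        ch = (e & f) ^ (~e & g)
        t1 = (h + S1 + ch + K[i] + w[i]) & 0xFFFFFFFF
        S0 = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)
        maj = (a & b) ^ (a & c) ^ (b & c)
        a, b, c, d, e, f, g, h = ((t1 + S0 + maj) & 0xFFFFFFFF, a, b, c,
                                  (d + t1) & 0xFFFFFFFF, e, f, g)
    return [(x + y) & 0xFFFFFFFF for x, y in zip(state, (a, b, c, d, e, f, g, h))]

def sha256(msg):
    state = [0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
             0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19]
    padded = (msg + b"\x80" + b"\x00" * ((55 - len(msg)) % 64)
              + struct.pack(">Q", 8 * len(msg)))
    for off in range(0, len(padded), 64):
        state = compress(state, padded[off:off + 64])
    return b"".join(struct.pack(">L", word) for word in state)

assert sha256(b"abc") == hashlib.sha256(b"abc").digest()
```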
$code=<<___;
#ifndef __KERNEL__
# include <GFp/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.extern GFp_armcap_P
.hidden GFp_armcap_P
.LOPENSSL_armcap:
.word GFp_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global GFp_sha256_block_data_order
.type GFp_sha256_block_data_order,%function
GFp_sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ GFp_sha256_block_data_order
#else
adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ GFp_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
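# One BODY_* call emits one fully unrolled round; unshift(@V,pop(@V)) rotates
# the (a..h) register names between calls, so the state never has to be moved
# between registers. Rounds 16..63 reuse the 16 bodies emitted below by
# branching back to .Lrounds_16_xx until the K[63] low-byte check fires.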
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size GFp_sha256_block_data_order,.-GFp_sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
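# AUTOLOAD turns any call to an undefined &name_with_underscores into the
# corresponding NEON mnemonic appended to $code: e.g. &vshr_u32($T2,$T0,7)
# emits "\tvshr.u32\tq10,q8,#7\n" (a purely numeric last argument gets a '#').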
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
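# Xupdate emits the NEON schedule for four new W words (the sigma0/sigma1
# rotates built from vshr+vsli pairs), interleaves the scalar round snippets
# passed in via $body, and stores W+K through $Xfer for the integer rounds to
# pull back off the stack.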
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
push(@X,shift(@X)); # "rotate" X[]
}
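# Xpreload does the same interleaving for the first four rounds of the next
# block: it only byte-swaps the freshly loaded X[] words and precomputes W+K,
# since no schedule update is needed yet.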
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
adr $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}
ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8
.align 4
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
it ne
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
{ my %opcode = (
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# emit the encoding byte by byte, since ARMv7 instructions are always
# encoded little-endian. The correct solution is to use the .inst
# directive, but older assemblers don't implement it:-(
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
foreach (split($/,$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT"; # enforce flush

View File

@@ -0,0 +1,671 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.
# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.
# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order in
# h[0-7], namely with the most significant dword at the *lower* address, which
# was reflected in the two parameters below as 0 and 4. Now the caller is
# expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
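# Stack layout: the working variables a..h are kept on the stack as 64-bit
# lo/hi pairs at the offsets above (a and e are also cached in registers),
# X[i] is stored at sp+#Xoff each round, and the frame grows by 8 bytes per
# round (sub sp,sp,#8 in the body) so the last 16 schedule words stay
# addressable; all 80*8=640 bytes are released in one add after the last round.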
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
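@ (a 64-bit ROTR by n>=32 is a ROTR by 32, which swaps the two halves,
@ followed by a ROTR by n-32 on the swapped value; that is where the
@ hi>>9^lo<<23 and lo>>9^hi<<23 terms of the rotate-by-41 come from)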
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
#if __ARM_ARCH__>=7
it eq @ Thumb2 thing, sanity check in ARM
#endif
orreq $Ktbl,$Ktbl,#1
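@ (bit 0 of the K512 pointer doubles as a loop-exit flag: it is set once
@ the low byte of K[i].lo matches the magic value, 0x94 for K[15] and
@ 0x17 for K[79], and the tst on that bit below steers .L00_15/.L16_79)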
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include <GFp/arm_arch.h>
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code 32
#endif
.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.extern GFp_armcap_P
.hidden GFp_armcap_P
.LOPENSSL_armcap:
.word GFp_armcap_P-.Lsha512_block_data_order
.skip 32-4
#else
.skip 32
#endif
.global GFp_sha512_block_data_order
.type GFp_sha512_block_data_order,%function
GFp_sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ GFp_sha512_block_data_order
#else
adr r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ GFp_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
ittt eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
sub $Ktbl,$Ktbl,#640
teq $inp,$len
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size GFp_sha512_block_data_order,.-GFp_sha512_block_data_order
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
veor $t1,$t0
vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
veor $t2,$t1 @ Sigma1(e)
vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vadd.i64 $T1,$t2
vshr.u64 $t2,$a,#@Sigma0[2]
vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
veor $h,$t0,$t1
vadd.i64 $T1,$K
vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
vadd.i64 $d,$T1
vadd.i64 $Maj,$T1
@ vadd.i64 $h,$Maj
___
}
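# NEON_00_15 computes Ch and Maj with vbsl bit-selects (e selects between f
# and g for Ch; a^b selects between c and b for Maj) and, like the integer
# path, defers the h+=Maj add to the following round, carried in d30.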
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vadd.i64 @_[0],d30 @ h+=Maj from the past
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
VFP_ABI_PUSH
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
VFP_ABI_POP
ret @ bx lr
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
print $code;
close STDOUT or die "error closing STDOUT"; # enforce flush

View File

@@ -0,0 +1,462 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPLv2 terms is granted.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
# 10% (or by 1 cycle per round), but at the cost of 20% loss
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are an
# indication of some compiler "pathology"; most notably, code
# generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%.
$output=pop;
$flavour=pop;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
if ($output =~ /sha512-armv8/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
$func="GFp_sha${BITS}_block_data_order";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);
$code.=<<___ if ($i<16);
#ifndef __ARMEB__
rev @X[$i],@X[$i] // $i
#endif
___
$code.=<<___ if ($i<13 && ($i&1));
ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___ if ($i==13);
ldp @X[14],@X[15],[$inp]
___
$code.=<<___ if ($i>=14);
ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___ if ($i>0 && $i<16);
add $a,$a,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=11);
str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies a merged rotate-and-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such a merged instruction is
# not necessarily the best choice on the critical path... On the other
# hand, Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See the (**) footnote above.
$code.=<<___ if ($i<15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
and $t1,$f,$e
bic $t2,$g,$e
add $h,$h,@X[$i&15] // h+=X[i]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
ror $T0,$a,#$Sigma0[0]
add $h,$h,$t1 // h+=Ch(e,f,g)
eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
add $h,$h,$t0 // h+=Sigma1(e)
and $t3,$t3,$t2 // (b^c)&=(a^b)
add $d,$d,$h // d+=h
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
//add $h,$h,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
ror $T1,@X[($j+1)&15],#$sigma0[0]
and $t1,$f,$e
ror $T2,@X[($j+14)&15],#$sigma1[0]
bic $t2,$g,$e
ror $T0,$a,#$Sigma0[0]
add $h,$h,@X[$i&15] // h+=X[i]
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
eor $T0,$T0,$a,ror#$Sigma0[1]
add $h,$h,$t1 // h+=Ch(e,f,g)
and $t3,$t3,$t2 // (b^c)&=(a^b)
eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
add $h,$h,$t0 // h+=Sigma1(e)
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
add @X[$j],@X[$j],@X[($j+9)&15]
add $d,$d,$h // d+=h
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
add @X[$j],@X[$j],$T1
add $h,$h,$t1 // h+=Sigma0(a)
add @X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);
}
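# For $i<15 the body above emits a plain round while the first 16 words are
# loaded and byte-swapped on the fly; from $i>=15 each round is fused with one
# message-schedule update, and ($t2,$t3) swap roles so this round's "a^b"
# becomes the next round's "b^c".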
$code.=<<___;
#ifndef __KERNEL__
# include <GFp/arm_arch.h>
#endif
.text
.extern GFp_armcap_P
.hidden GFp_armcap_P
.globl $func
.type $func,%function
.align 6
$func:
___
$code.=<<___ if ($SZ==4);
AARCH64_VALID_CALL_TARGET
#ifndef __KERNEL__
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:GFp_armcap_P
#else
adrp x16,:pg_hi21:GFp_armcap_P
#endif
ldr w16,[x16,:lo12:GFp_armcap_P]
tst w16,#ARMV8_SHA256
b.ne .Lv8_entry
#endif
___
$code.=<<___;
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ
ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adrp $Ktbl,:pg_hi21:.LK$BITS
add $Ktbl,$Ktbl,:lo12:.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx
ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop
ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size $func,.-$func
.section .rodata
.align 6
.type .LK$BITS,%object
.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.text
#ifndef __KERNEL__
.type sha256_block_armv8,%function
.align 6
sha256_block_armv8:
.Lv8_entry:
// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adrp $Ktbl,:pg_hi21:.LK256
add $Ktbl,$Ktbl,:lo12:.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size sha256_block_armv8,.-sha256_block_armv8
#endif
___
}
{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
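# unsha256() hand-assembles the SHA-256 crypto-extension mnemonics into .inst
# words, so the generated file does not rely on assembler support for them;
# the captured register numbers land in the Rd (bits 0-4), Rn (bits 5-9) and
# Rm (bits 16-20) fields of the base opcodes above.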
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT";

File diff suppressed because it is too large