STEVEN REPORT: 2011-06

2011-06-19

CRC32 SSE4.2

Core i7 2600Kの場合、crc32 r32, r32のスループット1、レイテンシ3なのは既知だけど、crc32 r64, r64はどうなのか調べてみた。
結論、r64とr32のスループット及びレイテンシは等しい。

;yasm -f win32 crc32.asm
CPU  SSE4.2
USE32
section .text
;__stdcall crc32_latency_32bit(DWORD loop);
global _crc32_latency_32bit@4
_crc32_latency_32bit@4:
push ebp
mov  ebp, esp
mov  ecx, [ebp+08H]
align 16
.loop_:
TIMES 1000 crc32 eax, ebx
dec  ecx
jnz  .loop_
pop  ebp
ret  4

;__stdcall crc32_throughput_32bit(DWORD loop);
%MACRO step_op 0
crc32 eax, ebx
crc32 edx, ebx
crc32 edi, ebx
crc32 esi, ebx
%ENDMACRO
global _crc32_throughput_32bit@4
_crc32_throughput_32bit@4:
push ebp
mov  ebp, esp
mov  ecx, [ebp+08H]
pusha
align 16
.loop_:
%rep 1000
step_op
%endrep
dec  ecx
jnz  .loop_
popa
pop  ebp
ret  4
;yasm -f x64 crc32_64.asm
CPU  SSE4.2
USE64
section .text
global _crc32_latency_64bit
_crc32_latency_64bit:
align 16
.loop_:
TIMES 1000 crc32 rax, rbx
dec  rcx
jnz  .loop_
ret
%MACRO step_op 0
crc32 rax, rbx
crc32 r8, rbx
crc32 r9, rbx
crc32 r10, rbx
%ENDMACRO
global _crc32_throughput_64bit
_crc32_throughput_64bit:
align 16
.loop_:
%rep 1000
step_op
%endrep
dec  rcx
jnz  .loop_
ret
#include <Windows.h>
#include <stdio.h>
#ifdef _M_X64
extern "C"
{
extern void _crc32_latency_64bit(DWORD count);
extern void _crc32_throughput_64bit(DWORD count);
}
#define crc32_latency _crc32_latency_64bit
#define crc32_throughput _crc32_throughput_64bit
#else
extern "C"
{
extern void __stdcall crc32_latency_32bit(DWORD loop);
extern void __stdcall crc32_throughput_32bit(DWORD loop);
}
#define crc32_latency crc32_latency_32bit
#define crc32_throughput crc32_throughput_32bit
#endif
int main()
{
LARGE_INTEGER f, b, a;
SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
QueryPerformanceFrequency(&f);
#ifdef _M_X64
puts("X64");
#else
puts("x86");
#endif
printf("freq: %I64d\n", f.QuadPart);
crc32_latency(1);
QueryPerformanceCounter(&b);
crc32_latency(1000000);
QueryPerformanceCounter(&a);
printf("%I64d\n", a.QuadPart-b.QuadPart);
crc32_throughput(1);
QueryPerformanceCounter(&b);
crc32_throughput(1000000);
QueryPerformanceCounter(&a);
printf("%I64d\n", a.QuadPart-b.QuadPart);
return 0;
}

2011-06-17

AES-NI

・Sbox等のテーブル不要
・効率を考えるなら、CTRモードにして並列性を高める使い方をするのがいいのかな
・暗号鍵からラウンド鍵を作る部分は、Intelの資料にあるのを使う
aeskeygenassistはassistとは名ばかりの、すごく使いにくい命令
・鍵長256[bit]の暗号化・復号は次のような感じ

//暗号化
tmp = _mm_loadu_si128(&pIn[i]);
tmp = _mm_xor_si128(tmp, pKey[0]);
for (round=1; 13>=round; round++)
tmp = _mm_aesenc_si128(tmp, pKey[round]);
tmp = _mm_aesenclast_si128(tmp, pKey[14]);
_mm_storeu_si128(&pOut[i], tmp);

//復号
tmp = _mm_loadu_si128(&pIn[i]);
tmp = _mm_xor_si128(tmp, pKey[14]);
for (round=13; 1<=round; round--)
tmp = _mm_aesdec_si128(tmp, _mm_aesimc_si128(pKey[round]));
tmp = _mm_aesdeclast_si128(tmp, pKey[0]);
_mm_storeu_si128(&pOut[i], tmp);

AES暗号 (4)

・MixColumnsの実装

void CAES::MixColumns(uint8_t aState[16])
{
    uint8_t aTmp[16];

    ::memcpy(aTmp, aState, 16);
    aState[0] = mul_(0x02, aTmp[0]) ^ mul_(0x03, aTmp[1]) ^ aTmp[2] ^ aTmp[3];
    aState[1] = aTmp[0] ^ mul_(0x02, aTmp[1]) ^ mul_(0x03, aTmp[2]) ^ aTmp[3];
    aState[2] = aTmp[0] ^ aTmp[1] ^ mul_(0x02, aTmp[2]) ^ mul_(0x03, aTmp[3]);
    aState[3] = mul_(0x03, aTmp[0]) ^ aTmp[1] ^ aTmp[2] ^ mul_(0x02, aTmp[3]);
    aState[4] = mul_(0x02, aTmp[4]) ^ mul_(0x03, aTmp[5]) ^ aTmp[6] ^ aTmp[7];
    aState[5] = aTmp[4] ^ mul_(0x02, aTmp[5]) ^ mul_(0x03, aTmp[6]) ^ aTmp[7];
    aState[6] = aTmp[4] ^ aTmp[5] ^ mul_(0x02, aTmp[6]) ^ mul_(0x03, aTmp[7]);
    aState[7] = mul_(0x03, aTmp[4]) ^ aTmp[5] ^ aTmp[6] ^ mul_(0x02, aTmp[7]);
    aState[8] = mul_(0x02, aTmp[8]) ^ mul_(0x03, aTmp[9]) ^ aTmp[10] ^ aTmp[11];
    aState[9] = aTmp[8] ^ mul_(0x02, aTmp[9]) ^ mul_(0x03, aTmp[10]) ^ aTmp[11];
    aState[10] = aTmp[8] ^ aTmp[9] ^ mul_(0x02, aTmp[10]) ^ mul_(0x03, aTmp[11]);
    aState[11] = mul_(0x03, aTmp[8]) ^ aTmp[9] ^ aTmp[10] ^ mul_(0x02, aTmp[11]);
    aState[12] = mul_(0x02, aTmp[12]) ^ mul_(0x03, aTmp[13]) ^ aTmp[14] ^ aTmp[15];
    aState[13] = aTmp[12] ^ mul_(0x02, aTmp[13]) ^ mul_(0x03, aTmp[14]) ^ aTmp[15];
    aState[14] = aTmp[12] ^ aTmp[13] ^ mul_(0x02, aTmp[14]) ^ mul_(0x03, aTmp[15]);
    aState[15] = mul_(0x03, aTmp[12]) ^ aTmp[13] ^ aTmp[14] ^ mul_(0x02, aTmp[15]);
}

void CAES::InvMixColumns(uint8_t aState[16])
{
    uint8_t aTmp[16];

    ::memcpy(aTmp, aState, 16);
    aState[0] = mul_(0x0e, aTmp[0]) ^ mul_(0x0b, aTmp[1]) ^ mul_(0x0d, aTmp[2]) ^ mul_(0x09, aTmp[3]);
    aState[1] = mul_(0x09, aTmp[0]) ^ mul_(0x0e, aTmp[1]) ^ mul_(0x0b, aTmp[2]) ^ mul_(0x0d, aTmp[3]);
    aState[2] = mul_(0x0d, aTmp[0]) ^ mul_(0x09, aTmp[1]) ^ mul_(0x0e, aTmp[2]) ^ mul_(0x0b, aTmp[3]);
    aState[3] = mul_(0x0b, aTmp[0]) ^ mul_(0x0d, aTmp[1]) ^ mul_(0x09, aTmp[2]) ^ mul_(0x0e, aTmp[3]);
    aState[4] = mul_(0x0e, aTmp[4]) ^ mul_(0x0b, aTmp[5]) ^ mul_(0x0d, aTmp[6]) ^ mul_(0x09, aTmp[7]);
    aState[5] = mul_(0x09, aTmp[4]) ^ mul_(0x0e, aTmp[5]) ^ mul_(0x0b, aTmp[6]) ^ mul_(0x0d, aTmp[7]);
    aState[6] = mul_(0x0d, aTmp[4]) ^ mul_(0x09, aTmp[5]) ^ mul_(0x0e, aTmp[6]) ^ mul_(0x0b, aTmp[7]);
    aState[7] = mul_(0x0b, aTmp[4]) ^ mul_(0x0d, aTmp[5]) ^ mul_(0x09, aTmp[6]) ^ mul_(0x0e, aTmp[7]);
    aState[8] = mul_(0x0e, aTmp[8]) ^ mul_(0x0b, aTmp[9]) ^ mul_(0x0d, aTmp[10]) ^ mul_(0x09, aTmp[11]);
    aState[9] = mul_(0x09, aTmp[8]) ^ mul_(0x0e, aTmp[9]) ^ mul_(0x0b, aTmp[10]) ^ mul_(0x0d, aTmp[11]);
    aState[10] = mul_(0x0d, aTmp[8]) ^ mul_(0x09, aTmp[9]) ^ mul_(0x0e, aTmp[10]) ^ mul_(0x0b, aTmp[11]);
    aState[11] = mul_(0x0b, aTmp[8]) ^ mul_(0x0d, aTmp[9]) ^ mul_(0x09, aTmp[10]) ^ mul_(0x0e, aTmp[11]);
    aState[12] = mul_(0x0e, aTmp[12]) ^ mul_(0x0b, aTmp[13]) ^ mul_(0x0d, aTmp[14]) ^ mul_(0x09, aTmp[15]);
    aState[13] = mul_(0x09, aTmp[12]) ^ mul_(0x0e, aTmp[13]) ^ mul_(0x0b, aTmp[14]) ^ mul_(0x0d, aTmp[15]);
    aState[14] = mul_(0x0d, aTmp[12]) ^ mul_(0x09, aTmp[13]) ^ mul_(0x0e, aTmp[14]) ^ mul_(0x0b, aTmp[15]);
    aState[15] = mul_(0x0b, aTmp[12]) ^ mul_(0x0d, aTmp[13]) ^ mul_(0x09, aTmp[14]) ^ mul_(0x0e, aTmp[15]);
}

定数16とかを直接入れているけどキニシナイ。
0x02, 0x03, 0x09, 0x0b, 0x0d, 0x0eのそれぞれにLUTを作っておいて引くっていう手もある。

AES暗号 (3)

・Rconの求め方 (2)

愚直に書くと、
uint32_t bsr_(uint32_t a)
{
   uint32_t result;

   __asm
   {
       bsr       eax, a
       mov       result, eax
   }

   return result;
}

uint8_t mul_(uint8_t a, uint8_t b)
{
   uint32_t c;
   uint32_t pos;

   c = 0;
   while (a)
   {
       pos = bsr_(a);
       c ^= (b << pos);
       a ^= (1 << pos);
   }

   //mod 100011011
   while ((~0xff) & c)
   {
       pos = bsr_(c);
       c ^= (0x11b << (pos-8));
   }

   return static_cast<uint8_t>(c);
}

const int MAX_RCON = 20;
int i;
unsigned long int Rcon[MAX_RCON];

Rcon[0] = 0;
Rcon[1] = 1;

for (i=2; i<MAX_RCON; i++)
   Rcon[i] = mul_(2, Rcon[i-1]);

for (i=0; i<MAX_RCON; i++)
   printf("%08x\n", Rcon[i]);

AES暗号 (2)

・Rconの求め方

0, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
．．．ってテーブルを作れば済む話ではあるけど。

1バイトは8ビットとする。

http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf

The round constant word array, Rcon[i], contains the values given by [x^(i-1),{00},{00},{00}],
with x^(i-1) being powers of x (x is denoted as {02}) in the field GF(2^8),
as discussed in Sec. 4.2 (note that i starts at 1, not 0).

Rcon[i]はi=1から始まる配列、一つの要素は4バイトで上位3バイトは0、最下位の1バイトはx^(i-1)、xは2、GF(2^8)での計算。
4.2にあるように、生成多項式を x^8 + x^4 + x^3 + x + 1 とした乗算を行う。
この乗算は手っ取り早く書くと、掛けた値をx^8 + x^4 + x^3 + x + 1、つまり 100011011(b)で割った余りを計算結果とする。

例えばi=9の時、2^(i-1)を求める。
2^(9-1) = 2^8 = 256(d) = 100000000(b)

100000000
xor 100011011
-------------
000011011

= 0x1b

2^(10-1) = 2^9 = 512(d) = 1000000000(b)
この場合は、100011011を左シフトして最上位ビットの桁を合わせる

1000000000
xor 1000110110
--------------
0000110110

= 0x36

AES暗号 (1)

手持ちのCore i7 2600KにはAES-NI、AES暗号化・復号を補助する命令があり、これを使ってみようということでAESについて調べてみた。

wikipedia: AES暗号
Intel® Advanced Encryption Standard (AES) Instructions Set - Rev 3
これらからリンクしてある*.pdfを参考に、実装した。
日本語の文書が少ないので、分かりにくい部分二点のみを少々書いてみる。

・Rconの求め方
・MixColumnsの実装

2011-06-16

Twitter BOTの作成

https://twitter.com/Ran_Pig
FEZ本スレから、ソレっぽい投稿を拾ってきて、Twitterで発言するだけ。
Visual C# 2010 Express + DotNetOpenAuth。