Core i7 2600Kの場合、crc32 r32, r32のスループット1、レイテンシ3なのは既知だけど、crc32 r64, r64はどうなのか調べてみた。
結論:crc32 r64, r64 と crc32 r32, r32 のスループット及びレイテンシは等しい。
;yasm -f win32 crc32.asm
CPU SSE4.2
USE32
section .text
; void __stdcall crc32_latency_32bit(DWORD loop);
; Latency kernel: each loop iteration executes 1000 consecutive
; `crc32 eax, ebx` instructions. All of them read and write eax, so they
; form a single serial dependency chain.
; The loop count is read from the stack at [ebp+8]; `ret 4` removes it.
; NOTE: the counter is decremented before it is tested, so a count of 0
; wraps around instead of skipping the loop.
global _crc32_latency_32bit@4
_crc32_latency_32bit@4:
push ebp
mov ebp, esp
; ecx = loop count (first and only argument)
mov ecx, [ebp+08H]
align 16
.loop_:
TIMES 1000 crc32 eax, ebx
dec ecx
jnz .loop_
pop ebp
ret 4
; void __stdcall crc32_throughput_32bit(DWORD loop);
; step_op: four crc32 instructions writing four different destination
; registers (eax, edx, edi, esi). They share only the source register ebx,
; so no instruction in the group depends on another one's result.
%MACRO step_op 0
crc32 eax, ebx
crc32 edx, ebx
crc32 edi, ebx
crc32 esi, ebx
%ENDMACRO
; Throughput kernel: each loop iteration executes step_op 1000 times
; (4000 crc32 instructions). The loop count is read from [ebp+8];
; `ret 4` removes it from the stack.
global _crc32_throughput_32bit@4
_crc32_throughput_32bit@4:
push ebp
mov ebp, esp
; ecx = loop count (first and only argument)
mov ecx, [ebp+08H]
; Save every general-purpose register; the loop overwrites edi and esi.
pusha
align 16
.loop_:
%rep 1000
step_op
%endrep
dec ecx
jnz .loop_
; Restore the registers saved by pusha.
popa
pop ebp
ret 4
;yasm -f x64 crc32_64.asm
CPU SSE4.2
USE64
section .text
; void _crc32_latency_64bit(DWORD count);
; Latency kernel: each loop iteration executes 1000 consecutive
; `crc32 rax, rbx` instructions. All of them read and write rax, so they
; form a single serial dependency chain.
; The loop count arrives in ecx as a 32-bit value.
; NOTE: the counter is decremented before it is tested, so a count of 0
; wraps around instead of skipping the loop.
global _crc32_latency_64bit
_crc32_latency_64bit:
align 16
.loop_:
TIMES 1000 crc32 rax, rbx
; Count in ecx, not rcx: the argument is a 32-bit DWORD and the upper half
; of rcx is not guaranteed to be zero on entry.
dec ecx
jnz .loop_
ret
; step_op: four crc32 instructions writing four different destination
; registers (rax, r8, r9, r10). They share only the source register rbx,
; so no instruction in the group depends on another one's result.
%MACRO step_op 0
crc32 rax, rbx
crc32 r8, rbx
crc32 r9, rbx
crc32 r10, rbx
%ENDMACRO
; void _crc32_throughput_64bit(DWORD count);
; Throughput kernel: each loop iteration executes step_op 1000 times
; (4000 crc32 instructions). The loop count arrives in ecx as a 32-bit value.
global _crc32_throughput_64bit
_crc32_throughput_64bit:
align 16
.loop_:
%rep 1000
step_op
%endrep
; Count in ecx, not rcx: the argument is a 32-bit DWORD and the upper half
; of rcx is not guaranteed to be zero on entry.
dec ecx
jnz .loop_
ret
#include <Windows.h>
#include <stdio.h>
// Declare the assembly benchmark kernels for the current target and expose
// them under the common names crc32_latency / crc32_throughput.
#if defined(_M_X64)
// 64-bit build: kernels assembled from the USE64 source.
extern "C" void _crc32_latency_64bit(DWORD count);
extern "C" void _crc32_throughput_64bit(DWORD count);
#define crc32_latency _crc32_latency_64bit
#define crc32_throughput _crc32_throughput_64bit
#else
// 32-bit build: __stdcall kernels assembled from the USE32 source.
extern "C" void __stdcall crc32_latency_32bit(DWORD loop);
extern "C" void __stdcall crc32_throughput_32bit(DWORD loop);
#define crc32_latency crc32_latency_32bit
#define crc32_throughput crc32_throughput_32bit
#endif
int main()
{
LARGE_INTEGER f, b, a;
SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
QueryPerformanceFrequency(&f);
#ifdef _M_X64
puts("X64");
#else
puts("x86");
#endif
printf("freq: %I64d\n", f.QuadPart);
crc32_latency(1);
QueryPerformanceCounter(&b);
crc32_latency(1000000);
QueryPerformanceCounter(&a);
printf("%I64d\n", a.QuadPart-b.QuadPart);
crc32_throughput(1);
QueryPerformanceCounter(&b);
crc32_throughput(1000000);
QueryPerformanceCounter(&a);
printf("%I64d\n", a.QuadPart-b.QuadPart);
return 0;
}
2011-06-19
2011-06-17
AES-NI
・Sbox等のテーブル不要
・効率を考えるなら、CTRモードにして並列性を高める使い方をするのがいいのかな
・暗号鍵からラウンド鍵を作る部分は、Intelの資料にあるのを使う
aeskeygenassistはassistとは名ばかりの、すごく使いにくい命令
・鍵長256[bit]の暗号化・復号は次のような感じ
// Encrypt block i with a 256-bit key: whiten with round key 0, run
// rounds 1..13 with aesenc, and finish with aesenclast using round key 14.
tmp = _mm_xor_si128(_mm_loadu_si128(&pIn[i]), pKey[0]);
for (round = 1; round <= 13; round++)
{
    tmp = _mm_aesenc_si128(tmp, pKey[round]);
}
tmp = _mm_aesenclast_si128(tmp, pKey[14]);
_mm_storeu_si128(&pOut[i], tmp);
// Decrypt block i: walk the round keys in reverse order. Round keys 13..1
// are passed through aesimc before being handed to aesdec; the first and
// last keys are used unchanged.
tmp = _mm_xor_si128(_mm_loadu_si128(&pIn[i]), pKey[14]);
for (round = 13; round >= 1; round--)
{
    tmp = _mm_aesdec_si128(tmp, _mm_aesimc_si128(pKey[round]));
}
tmp = _mm_aesdeclast_si128(tmp, pKey[0]);
_mm_storeu_si128(&pOut[i], tmp);
AES暗号 (4)
・MixColumnsの実装
// Applies the AES MixColumns step to the 16-byte state in place.
// Every group of four consecutive bytes is one column; each output byte is
// the XOR of the column's bytes weighted by a rotation of (02, 03, 01, 01),
// with the products computed by mul_.
void CAES::MixColumns(uint8_t aState[16])
{
    // Work from a snapshot so that outputs never feed back into inputs.
    uint8_t aSrc[16];
    ::memcpy(aSrc, aState, 16);

    for (int col = 0; col < 4; ++col)
    {
        const uint8_t* in = &aSrc[4 * col];
        uint8_t* out = &aState[4 * col];

        out[0] = mul_(0x02, in[0]) ^ mul_(0x03, in[1]) ^ in[2] ^ in[3];
        out[1] = in[0] ^ mul_(0x02, in[1]) ^ mul_(0x03, in[2]) ^ in[3];
        out[2] = in[0] ^ in[1] ^ mul_(0x02, in[2]) ^ mul_(0x03, in[3]);
        out[3] = mul_(0x03, in[0]) ^ in[1] ^ in[2] ^ mul_(0x02, in[3]);
    }
}
// Applies the AES InvMixColumns step to the 16-byte state in place.
// Every group of four consecutive bytes is one column; each output byte is
// the XOR of the column's bytes weighted by a rotation of (0e, 0b, 0d, 09),
// with the products computed by mul_.
void CAES::InvMixColumns(uint8_t aState[16])
{
    // Work from a snapshot so that outputs never feed back into inputs.
    uint8_t aSrc[16];
    ::memcpy(aSrc, aState, 16);

    for (int col = 0; col < 4; ++col)
    {
        const uint8_t* in = &aSrc[4 * col];
        uint8_t* out = &aState[4 * col];

        out[0] = mul_(0x0e, in[0]) ^ mul_(0x0b, in[1]) ^ mul_(0x0d, in[2]) ^ mul_(0x09, in[3]);
        out[1] = mul_(0x09, in[0]) ^ mul_(0x0e, in[1]) ^ mul_(0x0b, in[2]) ^ mul_(0x0d, in[3]);
        out[2] = mul_(0x0d, in[0]) ^ mul_(0x09, in[1]) ^ mul_(0x0e, in[2]) ^ mul_(0x0b, in[3]);
        out[3] = mul_(0x0b, in[0]) ^ mul_(0x0d, in[1]) ^ mul_(0x09, in[2]) ^ mul_(0x0e, in[3]);
    }
}
定数16とかを直接入れているけどキニシナイ。
0x02, 0x03, 0x09, 0x0b, 0x0d, 0x0eのそれぞれにLUTを作っておいて引くっていう手もある。
// Applies the AES MixColumns step to the 16-byte state in place.
// Every group of four consecutive bytes is one column; each output byte is
// the XOR of the column's bytes weighted by a rotation of (02, 03, 01, 01),
// with the products computed by mul_.
void CAES::MixColumns(uint8_t aState[16])
{
    // Work from a snapshot so that outputs never feed back into inputs.
    uint8_t aSrc[16];
    ::memcpy(aSrc, aState, 16);

    for (int col = 0; col < 4; ++col)
    {
        const uint8_t* in = &aSrc[4 * col];
        uint8_t* out = &aState[4 * col];

        out[0] = mul_(0x02, in[0]) ^ mul_(0x03, in[1]) ^ in[2] ^ in[3];
        out[1] = in[0] ^ mul_(0x02, in[1]) ^ mul_(0x03, in[2]) ^ in[3];
        out[2] = in[0] ^ in[1] ^ mul_(0x02, in[2]) ^ mul_(0x03, in[3]);
        out[3] = mul_(0x03, in[0]) ^ in[1] ^ in[2] ^ mul_(0x02, in[3]);
    }
}
// Applies the AES InvMixColumns step to the 16-byte state in place.
// Every group of four consecutive bytes is one column; each output byte is
// the XOR of the column's bytes weighted by a rotation of (0e, 0b, 0d, 09),
// with the products computed by mul_.
void CAES::InvMixColumns(uint8_t aState[16])
{
    // Work from a snapshot so that outputs never feed back into inputs.
    uint8_t aSrc[16];
    ::memcpy(aSrc, aState, 16);

    for (int col = 0; col < 4; ++col)
    {
        const uint8_t* in = &aSrc[4 * col];
        uint8_t* out = &aState[4 * col];

        out[0] = mul_(0x0e, in[0]) ^ mul_(0x0b, in[1]) ^ mul_(0x0d, in[2]) ^ mul_(0x09, in[3]);
        out[1] = mul_(0x09, in[0]) ^ mul_(0x0e, in[1]) ^ mul_(0x0b, in[2]) ^ mul_(0x0d, in[3]);
        out[2] = mul_(0x0d, in[0]) ^ mul_(0x09, in[1]) ^ mul_(0x0e, in[2]) ^ mul_(0x0b, in[3]);
        out[3] = mul_(0x0b, in[0]) ^ mul_(0x0d, in[1]) ^ mul_(0x09, in[2]) ^ mul_(0x0e, in[3]);
    }
}
定数16とかを直接入れているけどキニシナイ。
0x02, 0x03, 0x09, 0x0b, 0x0d, 0x0eのそれぞれにLUTを作っておいて引くっていう手もある。
AES暗号 (3)
・Rconの求め方 (2)
愚直に書くと、
// Returns the zero-based index of the most significant set bit of a
// (e.g. 1 -> 0, 0x80 -> 7, 0x100 -> 8).
// Written as a portable shift loop instead of an MSVC `__asm` BSR block,
// which only builds for 32-bit x86 targets.
// For a == 0 the result is 0; mul_ only calls this with non-zero values.
uint32_t bsr_(uint32_t a)
{
    uint32_t result = 0;
    // Each shift that still leaves a non-zero value moves the top bit one
    // position higher than the current result.
    while (a >>= 1)
    {
        ++result;
    }
    return result;
}
// Multiplies a and b as polynomials over GF(2) and reduces the product
// modulo 0x11b (x^8 + x^4 + x^3 + x + 1), returning the low 8 bits.
uint8_t mul_(uint8_t a, uint8_t b)
{
    uint32_t product = 0;

    // Carry-less multiply: for every set bit of a (highest first), XOR in a
    // copy of b shifted to that bit position, then clear the bit.
    while (a != 0)
    {
        const uint32_t bit = bsr_(a);
        product ^= (b << bit);
        a ^= (1 << bit);
    }

    // Reduce: while any bit above bit 7 is set, align the modulus with the
    // top bit and XOR it away.
    while ((product & (~0xff)) != 0)
    {
        const uint32_t top = bsr_(product);
        product ^= (0x11b << (top - 8));
    }

    return static_cast<uint8_t>(product);
}
// Builds the Rcon table by repeated multiplication by 2 via mul_ and
// prints every entry as 8 hex digits.
const int MAX_RCON = 20;
int i;
unsigned long int Rcon[MAX_RCON];
// Entry 0 is a placeholder; the sequence proper starts at index 1.
Rcon[0] = 0;
Rcon[1] = 1;
for (i = 2; i < MAX_RCON; i++)
{
    // Only the low byte of an entry is ever non-zero; make the narrowing
    // into mul_'s 8-bit parameter explicit.
    Rcon[i] = mul_(2, static_cast<uint8_t>(Rcon[i-1]));
}
for (i = 0; i < MAX_RCON; i++)
{
    // %lx matches the unsigned long element type (%x expects unsigned int).
    printf("%08lx\n", Rcon[i]);
}
愚直に書くと、
// Returns the zero-based index of the most significant set bit of a
// (e.g. 1 -> 0, 0x80 -> 7, 0x100 -> 8).
// Written as a portable shift loop instead of an MSVC `__asm` BSR block,
// which only builds for 32-bit x86 targets.
// For a == 0 the result is 0; mul_ only calls this with non-zero values.
uint32_t bsr_(uint32_t a)
{
    uint32_t result = 0;
    // Each shift that still leaves a non-zero value moves the top bit one
    // position higher than the current result.
    while (a >>= 1)
    {
        ++result;
    }
    return result;
}
// Multiplies a and b as polynomials over GF(2) and reduces the product
// modulo 0x11b (x^8 + x^4 + x^3 + x + 1), returning the low 8 bits.
uint8_t mul_(uint8_t a, uint8_t b)
{
    uint32_t product = 0;

    // Carry-less multiply: for every set bit of a (highest first), XOR in a
    // copy of b shifted to that bit position, then clear the bit.
    while (a != 0)
    {
        const uint32_t bit = bsr_(a);
        product ^= (b << bit);
        a ^= (1 << bit);
    }

    // Reduce: while any bit above bit 7 is set, align the modulus with the
    // top bit and XOR it away.
    while ((product & (~0xff)) != 0)
    {
        const uint32_t top = bsr_(product);
        product ^= (0x11b << (top - 8));
    }

    return static_cast<uint8_t>(product);
}
// Builds the Rcon table by repeated multiplication by 2 via mul_ and
// prints every entry as 8 hex digits.
const int MAX_RCON = 20;
int i;
unsigned long int Rcon[MAX_RCON];
// Entry 0 is a placeholder; the sequence proper starts at index 1.
Rcon[0] = 0;
Rcon[1] = 1;
for (i = 2; i < MAX_RCON; i++)
{
    // Only the low byte of an entry is ever non-zero; make the narrowing
    // into mul_'s 8-bit parameter explicit.
    Rcon[i] = mul_(2, static_cast<uint8_t>(Rcon[i-1]));
}
for (i = 0; i < MAX_RCON; i++)
{
    // %lx matches the unsigned long element type (%x expects unsigned int).
    printf("%08lx\n", Rcon[i]);
}
AES暗号 (2)
・Rconの求め方
0, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
...ってテーブルを作れば済む話ではあるけど。
1バイトは8ビットとする。
http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
Rcon[i]はi=1から始まる配列、一つの要素は4バイトで上位3バイトは0、最下位の1バイトはx^(i-1)、xは2、GF(2^8)での計算。
4.2にあるように、生成多項式を x^8 + x^4 + x^3 + x + 1 とした乗算を行う。
この乗算は手っ取り早く書くと、桁上がりなし(加算はXOR)で掛けた値を、x^8 + x^4 + x^3 + x + 1、つまり 100011011(b) で割った余りを計算結果とする。割り算の中の引き算もXORで行う。
例えばi=9の時、2^(i-1)を求める。
2^(9-1) = 2^8 = 256(d) = 100000000(b)
100000000
xor 100011011
-------------
000011011
= 0x1b
2^(10-1) = 2^9 = 512(d) = 1000000000(b)
この場合は、100011011を左シフトして最上位ビットの桁を合わせる
1000000000
xor 1000110110
--------------
0000110110
= 0x36
0, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
...ってテーブルを作れば済む話ではあるけど。
1バイトは8ビットとする。
http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
The round constant word array, Rcon[i], contains the values given by [x^(i-1),{00},{00},{00}],
with x^(i-1) being powers of x (x is denoted as {02}) in the field GF(2^8),
as discussed in Sec. 4.2 (note that i starts at 1, not 0).
Rcon[i]はi=1から始まる配列、一つの要素は4バイトで上位3バイトは0、最下位の1バイトはx^(i-1)、xは2、GF(2^8)での計算。
4.2にあるように、生成多項式を x^8 + x^4 + x^3 + x + 1 とした乗算を行う。
この乗算は手っ取り早く書くと、桁上がりなし(加算はXOR)で掛けた値を、x^8 + x^4 + x^3 + x + 1、つまり 100011011(b) で割った余りを計算結果とする。割り算の中の引き算もXORで行う。
例えばi=9の時、2^(i-1)を求める。
2^(9-1) = 2^8 = 256(d) = 100000000(b)
100000000
xor 100011011
-------------
000011011
= 0x1b
2^(10-1) = 2^9 = 512(d) = 1000000000(b)
この場合は、100011011を左シフトして最上位ビットの桁を合わせる
1000000000
xor 1000110110
--------------
0000110110
= 0x36
AES暗号 (1)
手持ちのCore i7 2600KにはAES-NI、AES暗号化・復号を補助する命令があり、これを使ってみようということでAESについて調べてみた。
wikipedia: AES暗号
Intel® Advanced Encryption Standard (AES) Instructions Set - Rev 3
これらからリンクしてある*.pdfを参考に、実装した。
日本語の文書が少ないので、分かりにくい部分二点のみを少々書いてみる。
・Rconの求め方
・MixColumnsの実装
wikipedia: AES暗号
Intel® Advanced Encryption Standard (AES) Instructions Set - Rev 3
これらからリンクしてある*.pdfを参考に、実装した。
日本語の文書が少ないので、分かりにくい部分二点のみを少々書いてみる。
・Rconの求め方
・MixColumnsの実装
2011-06-16
Twitter BOTの作成
https://twitter.com/Ran_Pig
FEZ本スレから、ソレっぽい投稿を拾ってきて、Twitterで発言するだけ。
Visual C# 2010 Express + DotNetOpenAuth。
FEZ本スレから、ソレっぽい投稿を拾ってきて、Twitterで発言するだけ。
Visual C# 2010 Express + DotNetOpenAuth。
登録:
投稿 (Atom)
