void serpent256_set_key (u32 *ks, const u32 *ukey)
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
ks[i] = ukey[i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 132; i++)
{
ks[i + 8] = rotl32 (ks[i + 7] ^ ks[i + 5] ^ ks[i + 3] ^ ks[i + 0] ^ 0x9e3779b9 ^ i, 11);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
for (u32 i = 0; i < 25; i++)
{
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 j = 0; j < 16; j += 2)
{
u32x t;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
for (u32 i = 0; i < 25; i++)
{
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 j = 0; j < 16; j += 2)
{
u32x t;
for (u32 ii = 0; ii < 25; ii++)
{
- #ifdef IS_NV
- #if CUDA_ARCH >= 500
- #else
+ #ifdef _unroll
#pragma unroll
#endif
- #endif
-
- #ifdef IS_AMD
- #pragma unroll
- #endif
-
for (u32 i = 0; i < 2; i++)
{
if (i) KEYSET10 else KEYSET00
for (u32 ii = 0; ii < 25; ii++)
{
- #ifdef IS_NV
- #if CUDA_ARCH >= 500
- #else
- #pragma unroll
- #endif
- #endif
-
- #ifdef IS_AMD
+ #ifdef _unroll
#pragma unroll
#endif
-
for (u32 i = 0; i < 2; i++)
{
if (i) KEYSET10 else KEYSET00
u32 tmpResult = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 32; i++)
{
const u32 b0 = -((search[0] >> i) & 1);
u32 out0[32];
u32 out1[32];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 32; i++)
{
out0[i] = out[ 0 + 31 - i];
transpose32c (out0);
transpose32c (out1);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int slice = 0; slice < 32; slice++)
{
const u32 r0 = out0[31 - slice];
const u32 w0s = (w0 << 1) & 0xfefefefe;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0, j = 0; i < 32; i += 8, j += 7)
{
atomic_or (&words_buf_r[block].b[j + 0], (((w0s >> (i + 7)) & 1) << slice));
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
{
const u32 block_len = wpc_len[pc];
- #pragma unroll 64
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 k = 0, p = block_len - 64; k < 64; k++, p++)
{
PUTCHAR64_BE (block, p, GETCHAR64_BE (l_alt_result, k));
u32x r = data[0];
u32x l = data[1];
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
u32x r = data[0];
u32x l = data[1];
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
KXX_DECL u32 k36, k37, k38, k39, k40, k41;
KXX_DECL u32 k42, k43, k44, k45, k46, k47;
- #ifdef IS_NV
- #if CUDA_ARCH >= 500
- #else
+ #ifdef _unroll
#pragma unroll
#endif
- #endif
-
- #ifdef IS_AMD
- #pragma unroll
- #endif
-
- #ifdef IS_GENERIC
- #pragma unroll 1
- #endif
-
for (u32 i = 0; i < 2; i++)
{
if (i) KEYSET10 else KEYSET00
u32 tmpResult = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 32; i++)
{
const u32 b0 = -((search[0] >> i) & 1);
u32 out0[32];
u32 out1[32];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 32; i++)
{
out0[i] = out[ 0 + 31 - i];
transpose32c (out0);
transpose32c (out1);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int slice = 0; slice < 32; slice++)
{
const u32 r0 = out0[31 - slice];
r = rotl32 (r, 3u);
l = rotl32 (l, 3u);
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
r = rotl32 (r, 3u);
l = rotl32 (l, 3u);
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
r = rotl32 (r, 3u);
l = rotl32 (l, 3u);
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
L0 = 0;
R0 = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 9; i++)
{
BF_ENCRYPT (L0, R0);
L0 = 0;
R0 = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 9; i++)
{
BF_ENCRYPT (L0, R0);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
u32x r = data[0];
u32x l = data[1];
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
u32x r = data[0];
u32x l = data[1];
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
u32x r = data[0];
u32x l = data[1];
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
u32x Lh[8];
u32x Ll[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = stateh[(i + 8) & 7] >> 24;
u32x Lh[8];
u32x Ll[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = Kh[(i + 8) & 7] >> 24;
Kh[7] = Lh[7];
Kl[7] = Ll[7];
- #pragma unroll 8
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = stateh[(i + 8) & 7] >> 24;
u32x Lh[8];
u32x Ll[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = stateh[(i + 8) & 7] >> 24;
u32x Lh[8];
u32x Ll[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = Kh[(i + 8) & 7] >> 24;
Kh[7] = Lh[7];
Kl[7] = Ll[7];
- #pragma unroll 8
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = stateh[(i + 8) & 7] >> 24;
u32x Lh[8];
u32x Ll[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = stateh[(i + 8) & 7] >> 24;
u32x Lh[8];
u32x Ll[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = Kh[(i + 8) & 7] >> 24;
Kh[7] = Lh[7];
Kl[7] = Ll[7];
- #pragma unroll 8
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
const u32x Lp0 = stateh[(i + 8) & 7] >> 24;
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
u32 i;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (i = 0; i < 8; i++)
{
const u32 Lp0 = Kh[(i + 8) & 7] >> 24;
Kh[7] = Lh[7];
Kl[7] = Ll[7];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (i = 0; i < 8; i++)
{
const u32 Lp0 = stateh[(i + 8) & 7] >> 24;
u32 i;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (i = 0; i < 8; i++)
{
const u32 Lp0 = Kh[(i + 8) & 7] >> 24;
Kh[7] = Lh[7];
Kl[7] = Ll[7];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (i = 0; i < 8; i++)
{
const u32 Lp0 = stateh[(i + 8) & 7] >> 24;
u32 i;
- #pragma unroll 8
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (i = 0; i < 8; i++)
{
const u8 Lp0 = Kh[(i + 8) & 7] >> 24;
Kh[7] = Lh[7];
Kl[7] = Ll[7];
- #pragma unroll 8
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (i = 0; i < 8; i++)
{
const u8 Lp0 = stateh[(i + 8) & 7] >> 24;
u32 i;
- #pragma unroll 8
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (i = 0; i < 8; i++)
{
const u8 Lp0 = Kh[(i + 8) & 7] >> 24;
Kh[7] = Lh[7];
Kl[7] = Ll[7];
- #pragma unroll 8
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (i = 0; i < 8; i++)
{
const u8 Lp0 = stateh[(i + 8) & 7] >> 24;
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
rek[2] = userkey[2];
rek[3] = userkey[3];
- #pragma unroll 10
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0, j = 0; i < 10; i += 1, j += 4)
{
u32 temp = rek[j + 3];
AES128_ExpandKey (ukey, rek, s_te0, s_te1, s_te2, s_te3, s_te4);
- #pragma unroll KEYLEN
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < KEYLEN; i++) rdk[i] = rek[i];
AES128_InvertKey (rdk, s_td0, s_td1, s_td2, s_td3, s_td4, s_te0, s_te1, s_te2, s_te3, s_te4);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
u32 rdk[KEYLEN];
- #pragma unroll 60
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < KEYLEN; i++) rdk[i] = rek[i];
AES256_InvertKey (rdk, s_td0, s_td1, s_td2, s_td3, s_td4, s_te0, s_te1, s_te2, s_te3, s_te4);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
if (j1)
{
- #pragma unroll 32
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 k = 0, p = block_len - 32; k < 32; k++, p++)
{
PUTCHAR32_BE (block, p, GETCHAR32_BE (alt_result, k));
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
digest[3] = SHA1M_D;
digest[4] = SHA1M_E;
- #pragma unroll 32
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 32; i++) final[i] = 0;
final[0] = w0[0];
digest[3] = SHA1M_D;
digest[4] = SHA1M_E;
- #pragma unroll 32
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 32; i++) final[i] = 0;
final[0] = w0[0];
digest[3] = SHA1M_D;
digest[4] = SHA1M_E;
- #pragma unroll 32
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 32; i++) final[i] = 0;
final[0] = w0[0];
digest[3] = SHA1M_D;
digest[4] = SHA1M_E;
- #pragma unroll 32
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 32; i++) final[i] = 0;
final[0] = w0[0];
digest[3] = SHA1M_D;
digest[4] = SHA1M_E;
- #pragma unroll 32
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 32; i++) final[i] = 0;
final[0] = swap32 (w0[0]);
digest[3] = SHA1M_D;
digest[4] = SHA1M_E;
- #pragma unroll 32
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 32; i++) final[i] = 0;
final[0] = swap32 (w0[0]);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP_Z (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_STEP_Z (i);
ROUND_STEP_S (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_STEP_S (i);
{
w_s1[15] = 0 | salt_buf0 >> 16;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
w_s2[ 2] = salt_buf2 << 16 | 0;
w_s2[15] = (510 + 8) * 8;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
{
w_s1[15] = 0 | salt_buf0 >> 16;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
w_s2[ 2] = salt_buf2 << 16 | 0;
w_s2[15] = (510 + 8) * 8;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP_Z (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_STEP_Z (i);
ROUND_STEP_S (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_STEP_S (i);
{
w_s1[15] = 0 | salt_buf0 >> 16;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
w_s2[ 2] = salt_buf2 << 16 | 0;
w_s2[15] = (510 + 8) * 8;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
{
w_s1[15] = 0 | salt_buf0 >> 16;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
w_s2[ 2] = salt_buf2 << 16 | 0;
w_s2[15] = (510 + 8) * 8;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP_Z (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_STEP_Z (i);
ROUND_STEP_S (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_STEP_S (i);
{
w_s1[15] = 0 | salt_buf0 >> 16;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
w_s2[ 2] = salt_buf2 << 16 | 0;
w_s2[15] = (510 + 8) * 8;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
{
w_s1[15] = 0 | salt_buf0 >> 16;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
w_s2[ 2] = salt_buf2 << 16 | 0;
w_s2[15] = (510 + 8) * 8;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i++)
{
w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND512_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND512_EXPAND (); ROUND512_STEP (i);
u32x r = data[0];
u32x l = data[1];
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
u32x r = data[0];
u32x l = data[1];
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
u32x r = data[0];
u32x l = data[1];
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i += 2)
{
u32x u;
c = c & 0x0fffffff;
- #pragma unroll 16
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
{
u32 s = 48;
- #pragma unroll 12
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int j = 0; j < 12; j++)
{
u32x tmp_in = in[j];
u32x c;
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 4; i++)
{
t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
{
u32 s = 48;
- #pragma unroll 12
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int j = 0; j < 12; j++)
{
u32x tmp_in = in[j];
u32x c;
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 4; i++)
{
t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
{
u32 s = 48;
- #pragma unroll 12
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int j = 0; j < 12; j++)
{
u32x tmp_in = in[j];
u32x c;
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 4; i++)
{
t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
{
u32 s = 48;
- #pragma unroll 12
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int j = 0; j < 12; j++)
{
u32x tmp_in = in[j];
u32x c;
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 4; i++)
{
t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
{
u32 s = 48;
- #pragma unroll 12
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int j = 0; j < 12; j++)
{
u32x tmp_in = in[j];
u32x c;
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 4; i++)
{
t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
{
u32 s = 48;
- #pragma unroll 12
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int j = 0; j < 12; j++)
{
u32x tmp_in = in[j];
u32x c;
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 4; i++)
{
t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
const u32 x = gid % xSIZE;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
salsa_r (X);
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
uint4 X[STATE_CNT4];
uint4 T[STATE_CNT4];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int z = 0; z < STATE_CNT4; z++) X[z] = swap32_4 (tmps[gid].P[z]);
scrypt_smix (X, T, scrypt_phy, d_scryptV_buf);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = swap32_4 (X[z]);
#if SCRYPT_P >= 1
{
u32 s = 48;
- #pragma unroll 12
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int j = 0; j < 12; j++)
{
u32 tmp_in = in[j];
u32 c;
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 0; i < 4; i++)
{
t ^= (in[i] >> 0) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
ROUND_STEP (0);
- #ifdef IS_AMD
- // #pragma unroll
- // breaks compiler
- #else
+ #ifdef _unroll
#pragma unroll
#endif
for (int i = 16; i < 80; i += 16)
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
ptr[i] = v; v += a;
u32 j = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 255; i += 5)
{
j += rc4_key->S[i + 0] + d0; swap (rc4_key, i + 0, j);
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 out[4])
{
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
ptr[i] = v; v += a;
u32 j = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 255; i += 5)
{
j += rc4_key->S[i + 0] + d0; swap (rc4_key, i + 0, j);
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 out[4])
{
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
ptr[i] = v; v += a;
u32 j = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 255; i += 5)
{
j += rc4_key->S[i + 0] + d0; swap (rc4_key, i + 0, j);
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 out[4])
{
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
ptr[i] = v; v += a;
u32 j = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 255; i += 5)
{
j += rc4_key->S[i + 0] + d0; swap (rc4_key, i + 0, j);
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 out[4])
{
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
ptr[i] = v; v += a;
u32 j = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 255; i += 5)
{
j += rc4_key->S[i + 0] + d0; swap (rc4_key, i + 0, j);
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 out[4])
{
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
ptr[i] = v; v += a;
u32 j = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 255; i += 5)
{
j += rc4_key->S[i + 0] + d0; swap (rc4_key, i + 0, j);
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 out[4])
{
- #pragma unroll 4
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u32 j = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
u32 idx = i * 16;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
ROUND256_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND256_EXPAND (); ROUND256_STEP (i);
ROUND384_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND384_EXPAND (); ROUND384_STEP (i);
ROUND512_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND512_EXPAND (); ROUND512_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
u64x s[8];
u64x t[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = h[i];
k[i] = SBOG_LPSti64;
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = m[i];
for (int r = 0; r < 12; r++)
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = s[i] ^ k[i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = SBOG_LPSti64;
t[i] = k[i] ^ sbob_rc64[r][i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
k[i] = SBOG_LPSti64;
}
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
h[i] ^= s[i] ^ k[i] ^ m[i];
u64x s[8];
u64x t[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = h[i];
k[i] = SBOG_LPSti64;
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = m[i];
for (int r = 0; r < 12; r++)
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = s[i] ^ k[i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = SBOG_LPSti64;
t[i] = k[i] ^ sbob_rc64[r][i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
k[i] = SBOG_LPSti64;
}
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
h[i] ^= s[i] ^ k[i] ^ m[i];
u64x s[8];
u64x t[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = h[i];
k[i] = SBOG_LPSti64;
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = m[i];
for (int r = 0; r < 12; r++)
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = s[i] ^ k[i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = SBOG_LPSti64;
t[i] = k[i] ^ sbob_rc64[r][i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
k[i] = SBOG_LPSti64;
}
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
h[i] ^= s[i] ^ k[i] ^ m[i];
u64x s[8];
u64x t[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = h[i];
k[i] = SBOG_LPSti64;
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = m[i];
for (int r = 0; r < 12; r++)
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = s[i] ^ k[i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = SBOG_LPSti64;
t[i] = k[i] ^ sbob_rc64[r][i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
k[i] = SBOG_LPSti64;
}
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
h[i] ^= s[i] ^ k[i] ^ m[i];
u64x s[8];
u64x t[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = h[i];
k[i] = SBOG_LPSti64;
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = m[i];
for (int r = 0; r < 12; r++)
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = s[i] ^ k[i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = SBOG_LPSti64;
t[i] = k[i] ^ sbob_rc64[r][i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
k[i] = SBOG_LPSti64;
}
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
h[i] ^= s[i] ^ k[i] ^ m[i];
u64x s[8];
u64x t[8];
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = h[i];
k[i] = SBOG_LPSti64;
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = m[i];
for (int r = 0; r < 12; r++)
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
t[i] = s[i] ^ k[i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
s[i] = SBOG_LPSti64;
t[i] = k[i] ^ sbob_rc64[r][i];
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
k[i] = SBOG_LPSti64;
}
}
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < 8; i++)
{
h[i] ^= s[i] ^ k[i] ^ m[i];
ROUND_STEP (0);
- #ifdef IS_AMD
- // #pragma unroll
- // breaks compiler
- #else
+ #ifdef _unroll
#pragma unroll
#endif
for (int i = 16; i < 80; i += 16)
ROUND_STEP (0);
- //#pragma unroll
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (int i = 16; i < 80; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
c = c & 0x0fffffff;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global u32 *in, u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global u32 *in, u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
__local u32 *ptr = (__local u32 *) rc4_key->S;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 i = 0; i < 64; i++)
{
*ptr++ = v; v += a;
u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global u32 *in, u32 out[4])
{
+ #ifdef _unroll
#pragma unroll
+ #endif
for (u32 k = 0; k < 4; k++)
{
u32 xor4 = 0;
* License.....: MIT
*/
-
+
#define _SHA1_
#include "include/constants.h"
rek[2] = userkey[2];
rek[3] = userkey[3];
- #pragma unroll 10
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0, j = 0; i < 10; i += 1, j += 4)
{
u32 temp = rek[j + 3];
const u32 pw_len = pws[gid].pw_len;
append_0x80_4x4 (w0, w1, w2, w3, pw_len);
-
+
w0[0] = swap32 (w0[0]);
w0[1] = swap32 (w0[1]);
w0[2] = swap32 (w0[2]);
w3[3] = swap32 (w3[3]);
w3[3] = pw_len * 8;
-
+
/**
* KEK
*/
tmps[gid].cipher[1] = salt_bufs[salt_pos].salt_buf[5];
tmps[gid].cipher[2] = 0;
tmps[gid].cipher[3] = 0;
-
+
}
__kernel void m13200_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global axcrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
ukeyx[1] = tmps[gid].KEK[1];
ukeyx[2] = tmps[gid].KEK[2];
ukeyx[3] = tmps[gid].KEK[3];
-
+
AES128_ExpandKey (ukeyx, rek, s_te0, s_te1, s_te2, s_te3, s_te4);
- #pragma unroll KEYLEN
+ #ifdef _unroll
+ #pragma unroll
+ #endif
for (u32 i = 0; i < KEYLEN; i++) rdk[i] = rek[i];
AES128_InvertKey (rdk, s_td0, s_td1, s_td2, s_td3, s_td4, s_te0, s_te1, s_te2, s_te3, s_te4);
u32 lsb[4];
u32 cipher[4];
-
+
lsb[0] = tmps[gid].lsb[0];
lsb[1] = tmps[gid].lsb[1];
lsb[2] = tmps[gid].lsb[2];
cipher[1] = tmps[gid].cipher[1];
cipher[2] = tmps[gid].cipher[2];
cipher[3] = tmps[gid].cipher[3];
-
-
+
+
/**
* AxCrypt main cipher routine
*/
tmps[gid].lsb[1] = lsb[1];
tmps[gid].lsb[2] = lsb[2];
tmps[gid].lsb[3] = lsb[3];
-
+
tmps[gid].cipher[0] = cipher[0];
tmps[gid].cipher[1] = cipher[1];
tmps[gid].cipher[2] = cipher[2];
ROUND_STEP (0);
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
u32x out_len = 0;
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 0; i < VECT_SIZE; i++)
{
u32 tmp0[4];
* License.....: MIT
*/
-#define DEVICE_TYPE_CPU 2
-#define DEVICE_TYPE_GPU 4
-
typedef uchar u8;
typedef ushort u16;
typedef uint u32;
* License.....: MIT
*/
-#ifdef cl_khr_byte_addressable_store
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+/**
+ * device type
+ */
+
+#define DEVICE_TYPE_CPU 2
+#define DEVICE_TYPE_GPU 4
+#define DEVICE_TYPE_ACCEL 8
+
+#if DEVICE_TYPE == DEVICE_TYPE_CPU
+#define IS_CPU
+#elif DEVICE_TYPE == DEVICE_TYPE_GPU
+#define IS_GPU
+#elif DEVICE_TYPE == DEVICE_TYPE_ACCEL
+#define IS_ACCEL
#endif
/**
* vendor specific
*/
-#if VENDOR_ID == (1 << 0)
+#if VENDOR_ID == (1 << 0)
#define IS_AMD
#elif VENDOR_ID == (1 << 6)
#define IS_NV
#endif
/**
- * NV specific
+ * Unrolling is generally enabled, for all device types and hash modes
+ * There's a few exception when it's better not to unroll
*/
+// Some algorithms run into too much register pressure due to loop unrolling
+
#ifdef IS_NV
+#ifdef IS_GPU
+
+#if KERN_TYPE == 1500
+#undef _unroll
+#endif
+#if KERN_TYPE == 1800
+#undef _unroll
+#endif
+#if KERN_TYPE == 3000
+#undef _unroll
+#endif
+#if KERN_TYPE == 6221
+#undef _unroll
+#endif
+#if KERN_TYPE == 6222
+#undef _unroll
+#endif
+#if KERN_TYPE == 6223
+#undef _unroll
+#endif
+#if KERN_TYPE == 6500
+#undef _unroll
+#endif
+#if KERN_TYPE == 7100
+#undef _unroll
+#endif
+#if KERN_TYPE == 7400
+#undef _unroll
+#endif
+#if KERN_TYPE == 8200
+#undef _unroll
+#endif
+#if KERN_TYPE == 10400
+#undef _unroll
+#endif
+#if KERN_TYPE == 10500
+#undef _unroll
+#endif
+#if KERN_TYPE == 10700
+#undef _unroll
+#endif
+#if KERN_TYPE == 12300
+#undef _unroll
+#endif
+#if KERN_TYPE == 12400
+#undef _unroll
#endif
-/**
- * Generic
- */
+#endif
+#endif
+
+#ifdef IS_AMD
+#ifdef IS_GPU
+
+#if KERN_TYPE == 3200
+#undef _unroll
+#endif
+#if KERN_TYPE == 5200
+#undef _unroll
+#endif
+#if KERN_TYPE == 6100
+#undef _unroll
+#endif
+#if KERN_TYPE == 6221
+#undef _unroll
+#endif
+#if KERN_TYPE == 6222
+#undef _unroll
+#endif
+#if KERN_TYPE == 6223
+#undef _unroll
+#endif
+#if KERN_TYPE == 6400
+#undef _unroll
+#endif
+#if KERN_TYPE == 6500
+#undef _unroll
+#endif
+#if KERN_TYPE == 6800
+#undef _unroll
+#endif
+#if KERN_TYPE == 7100
+#undef _unroll
+#endif
+#if KERN_TYPE == 7400
+#undef _unroll
+#endif
+#if KERN_TYPE == 8000
+#undef _unroll
+#endif
+#if KERN_TYPE == 8200
+#undef _unroll
+#endif
+#if KERN_TYPE == 10900
+#undef _unroll
+#endif
+#if KERN_TYPE == 11600
+#undef _unroll
+#endif
+#if KERN_TYPE == 12300
+#undef _unroll
+#endif
+#if KERN_TYPE == 12800
+#undef _unroll
+#endif
+#if KERN_TYPE == 12900
+#undef _unroll
+#endif
+#if KERN_TYPE == 13000
+#undef _unroll
+#endif
-#ifdef IS_GENERIC
+#endif
+#endif
+
+// Some algorithms break due to loop unrolling, it's unknown why, probably compiler bugs
+// Can overlap with above cases
+
+#ifdef IS_AMD
+#ifdef IS_GPU
+
+#if KERN_TYPE == 1750
+#undef _unroll
+#endif
+#if KERN_TYPE == 1760
+#undef _unroll
+#endif
+#if KERN_TYPE == 6500
+#undef _unroll
+#endif
+#if KERN_TYPE == 7100
+#undef _unroll
+#endif
+#if KERN_TYPE == 9600
+#undef _unroll
+#endif
+#if KERN_TYPE == 12200
+#undef _unroll
+#endif
+#if KERN_TYPE == 12300
+#undef _unroll
+#endif
+
+#endif
#endif
// we don't have sm_* on vendors not NV but it doesn't matter
- snprintf (build_opts, sizeof (build_opts) - 1, "-cl-std=CL1.1 -I\"%s/\" -DVENDOR_ID=%u -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type);
+ snprintf (build_opts, sizeof (build_opts) - 1, "-cl-std=CL1.1 -I\"%s/\" -DVENDOR_ID=%u -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u -DKERN_TYPE=%u -D_unroll", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, kern_type);
/**
* main kernel