#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__kernel void m00000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- append_0x80_2x4 (wordl0, wordl1, pw_l_len);
+ append_0x80_2x4_S (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
* loop
*/
- for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{
- const u32 pw_r_len = combs_buf[il_pos].pw_len;
-
- const u32 pw_len = pw_l_len + pw_r_len;
-
- u32 wordr0[4];
-
- wordr0[0] = combs_buf[il_pos].i[0];
- wordr0[1] = combs_buf[il_pos].i[1];
- wordr0[2] = combs_buf[il_pos].i[2];
- wordr0[3] = combs_buf[il_pos].i[3];
+ const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
- u32 wordr1[4];
+ const u32x pw_len = pw_l_len + pw_r_len;
- wordr1[0] = combs_buf[il_pos].i[4];
- wordr1[1] = combs_buf[il_pos].i[5];
- wordr1[2] = combs_buf[il_pos].i[6];
- wordr1[3] = combs_buf[il_pos].i[7];
+ u32x wordr0[4] = { 0 };
+ u32x wordr1[4] = { 0 };
+ u32x wordr2[4] = { 0 };
+ u32x wordr3[4] = { 0 };
- u32 wordr2[4];
-
- wordr2[0] = 0;
- wordr2[1] = 0;
- wordr2[2] = 0;
- wordr2[3] = 0;
-
- u32 wordr3[4];
-
- wordr3[0] = 0;
- wordr3[1] = 0;
- wordr3[2] = 0;
- wordr3[3] = 0;
+ wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
+ wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
+ wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
+ wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
+ wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
+ wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
+ wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
+ wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
- u32 w0[4];
+ u32x w0[4];
w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3];
- u32 w1[4];
+ u32x w1[4];
w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3];
- u32 w2[4];
+ u32x w2[4];
w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3];
- u32 w3[4];
+ u32x w3[4];
w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8;
w3[3] = 0;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- append_0x80_2x4 (wordl0, wordl1, pw_l_len);
+ append_0x80_2x4_S (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
* loop
*/
- for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{
- const u32 pw_r_len = combs_buf[il_pos].pw_len;
+ const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
- const u32 pw_len = pw_l_len + pw_r_len;
+ const u32x pw_len = pw_l_len + pw_r_len;
- u32 wordr0[4];
+ u32x wordr0[4] = { 0 };
+ u32x wordr1[4] = { 0 };
+ u32x wordr2[4] = { 0 };
+ u32x wordr3[4] = { 0 };
- wordr0[0] = combs_buf[il_pos].i[0];
- wordr0[1] = combs_buf[il_pos].i[1];
- wordr0[2] = combs_buf[il_pos].i[2];
- wordr0[3] = combs_buf[il_pos].i[3];
-
- u32 wordr1[4];
-
- wordr1[0] = combs_buf[il_pos].i[4];
- wordr1[1] = combs_buf[il_pos].i[5];
- wordr1[2] = combs_buf[il_pos].i[6];
- wordr1[3] = combs_buf[il_pos].i[7];
-
- u32 wordr2[4];
-
- wordr2[0] = 0;
- wordr2[1] = 0;
- wordr2[2] = 0;
- wordr2[3] = 0;
-
- u32 wordr3[4];
-
- wordr3[0] = 0;
- wordr3[1] = 0;
- wordr3[2] = 0;
- wordr3[3] = 0;
+ wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
+ wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
+ wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
+ wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
+ wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
+ wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
+ wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
+ wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
- u32 w0[4];
+ u32x w0[4];
w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3];
- u32 w1[4];
+ u32x w1[4];
w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3];
- u32 w2[4];
+ u32x w2[4];
w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3];
- u32 w3[4];
+ u32x w3[4];
w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8;
w3[3] = 0;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- bool q_cond = allx (search[0] != a);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS (a, search[0])) continue;
MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0rl = w0r | w0l;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0rl = w0r | w0l;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
w0_t[0] = w0l | (w0r << 8);
w0_t[1] = w1l | (w0r >> 24);
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
w0_t[0] = w0l | (w0r << 8);
w0_t[1] = w1l | (w0r >> 24);
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
w0[0] = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
w0[0] = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+ const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
// attack-mode 0
-static inline u32x w0r_create_bft (__global bf_t *bfs_buf, const u32 il_pos)
+static inline u32x ix_create_bft (__global bf_t *bfs_buf, const u32 il_pos)
{
#if VECT_SIZE == 1
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i);
+ const u32x ix = (u32x) (bfs_buf[il_pos + 0].i);
#elif VECT_SIZE == 2
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
+ const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
#elif VECT_SIZE == 4
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
+ const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
#elif VECT_SIZE == 8
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
+ const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
#elif VECT_SIZE == 16
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i, bfs_buf[il_pos + 8].i, bfs_buf[il_pos + 9].i, bfs_buf[il_pos + 10].i, bfs_buf[il_pos + 11].i, bfs_buf[il_pos + 12].i, bfs_buf[il_pos + 13].i, bfs_buf[il_pos + 14].i, bfs_buf[il_pos + 15].i);
+ const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i, bfs_buf[il_pos + 8].i, bfs_buf[il_pos + 9].i, bfs_buf[il_pos + 10].i, bfs_buf[il_pos + 11].i, bfs_buf[il_pos + 12].i, bfs_buf[il_pos + 13].i, bfs_buf[il_pos + 14].i, bfs_buf[il_pos + 15].i);
#endif
- return w0r;
+ return ix;
+}
+
+// attack-mode 1
+
+static inline u32x pwlenx_create_combt (__global comb_t *combs_buf, const u32 il_pos)
+{
+ #if VECT_SIZE == 1
+ const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len);
+ #elif VECT_SIZE == 2
+ const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len);
+ #elif VECT_SIZE == 4
+ const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len);
+ #elif VECT_SIZE == 8
+ const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len, combs_buf[il_pos + 4].pw_len, combs_buf[il_pos + 5].pw_len, combs_buf[il_pos + 6].pw_len, combs_buf[il_pos + 7].pw_len);
+ #elif VECT_SIZE == 16
+ const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len, combs_buf[il_pos + 4].pw_len, combs_buf[il_pos + 5].pw_len, combs_buf[il_pos + 6].pw_len, combs_buf[il_pos + 7].pw_len, combs_buf[il_pos + 8].pw_len, combs_buf[il_pos + 9].pw_len, combs_buf[il_pos + 10].pw_len, combs_buf[il_pos + 11].pw_len, combs_buf[il_pos + 12].pw_len, combs_buf[il_pos + 13].pw_len, combs_buf[il_pos + 14].pw_len, combs_buf[il_pos + 15].pw_len);
+ #endif
+
+ return pw_lenx;
+}
+
+static inline u32x ix_create_combt (__global comb_t *combs_buf, const u32 il_pos, const int idx)
+{
+ #if VECT_SIZE == 1
+ const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx]);
+ #elif VECT_SIZE == 2
+ const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx]);
+ #elif VECT_SIZE == 4
+ const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx]);
+ #elif VECT_SIZE == 8
+ const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx], combs_buf[il_pos + 4].i[idx], combs_buf[il_pos + 5].i[idx], combs_buf[il_pos + 6].i[idx], combs_buf[il_pos + 7].i[idx]);
+ #elif VECT_SIZE == 16
+ const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx], combs_buf[il_pos + 4].i[idx], combs_buf[il_pos + 5].i[idx], combs_buf[il_pos + 6].i[idx], combs_buf[il_pos + 7].i[idx], combs_buf[il_pos + 8].i[idx], combs_buf[il_pos + 9].i[idx], combs_buf[il_pos + 10].i[idx], combs_buf[il_pos + 11].i[idx], combs_buf[il_pos + 12].i[idx], combs_buf[il_pos + 13].i[idx], combs_buf[il_pos + 14].i[idx], combs_buf[il_pos + 15].i[idx]);
+ #endif
+
+ return ix;
}
#if VECT_SIZE == 1
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 1) + 0].var[(idx)])
+#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 1) + 0].var[(idx)])
#elif VECT_SIZE == 2
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 2) + 0].var[(idx)], (arr)[((gid) * 2) + 1].var[(idx)])
+#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 2) + 0].var[(idx)], (arr)[((gid) * 2) + 1].var[(idx)])
#elif VECT_SIZE == 4
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 4) + 0].var[(idx)], (arr)[((gid) * 4) + 1].var[(idx)], (arr)[((gid) * 4) + 2].var[(idx)], (arr)[((gid) * 4) + 3].var[(idx)])
+#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 4) + 0].var[(idx)], (arr)[((gid) * 4) + 1].var[(idx)], (arr)[((gid) * 4) + 2].var[(idx)], (arr)[((gid) * 4) + 3].var[(idx)])
#elif VECT_SIZE == 8
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 8) + 0].var[(idx)], (arr)[((gid) * 8) + 1].var[(idx)], (arr)[((gid) * 8) + 2].var[(idx)], (arr)[((gid) * 8) + 3].var[(idx)], (arr)[((gid) * 8) + 4].var[(idx)], (arr)[((gid) * 8) + 5].var[(idx)], (arr)[((gid) * 8) + 6].var[(idx)], (arr)[((gid) * 8) + 7].var[(idx)])
+#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 8) + 0].var[(idx)], (arr)[((gid) * 8) + 1].var[(idx)], (arr)[((gid) * 8) + 2].var[(idx)], (arr)[((gid) * 8) + 3].var[(idx)], (arr)[((gid) * 8) + 4].var[(idx)], (arr)[((gid) * 8) + 5].var[(idx)], (arr)[((gid) * 8) + 6].var[(idx)], (arr)[((gid) * 8) + 7].var[(idx)])
#elif VECT_SIZE == 16
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 8) + 0].var[(idx)], (arr)[((gid) * 8) + 1].var[(idx)], (arr)[((gid) * 8) + 2].var[(idx)], (arr)[((gid) * 8) + 3].var[(idx)], (arr)[((gid) * 8) + 4].var[(idx)], (arr)[((gid) * 8) + 5].var[(idx)], (arr)[((gid) * 8) + 6].var[(idx)], (arr)[((gid) * 8) + 7].var[(idx)], (arr)[((gid) * 8) + 8].var[(idx)], (arr)[((gid) * 8) + 9].var[(idx)], (arr)[((gid) * 8) + 10].var[(idx)], (arr)[((gid) * 8) + 11].var[(idx)], (arr)[((gid) * 8) + 12].var[(idx)], (arr)[((gid) * 8) + 13].var[(idx)], (arr)[((gid) * 8) + 14].var[(idx)], (arr)[((gid) * 8) + 15].var[(idx)])
+#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 16) + 0].var[(idx)], (arr)[((gid) * 16) + 1].var[(idx)], (arr)[((gid) * 16) + 2].var[(idx)], (arr)[((gid) * 16) + 3].var[(idx)], (arr)[((gid) * 16) + 4].var[(idx)], (arr)[((gid) * 16) + 5].var[(idx)], (arr)[((gid) * 16) + 6].var[(idx)], (arr)[((gid) * 16) + 7].var[(idx)], (arr)[((gid) * 16) + 8].var[(idx)], (arr)[((gid) * 16) + 9].var[(idx)], (arr)[((gid) * 16) + 10].var[(idx)], (arr)[((gid) * 16) + 11].var[(idx)], (arr)[((gid) * 16) + 12].var[(idx)], (arr)[((gid) * 16) + 13].var[(idx)], (arr)[((gid) * 16) + 14].var[(idx)], (arr)[((gid) * 16) + 15].var[(idx)])
#endif
#if VECT_SIZE == 1
-#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 1) + 0].var[(idx)] = val;
+#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 1) + 0].var[(idx)] = val;
#elif VECT_SIZE == 2
-#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 2) + 0].var[(idx)] = val.s0; (arr)[((gid) * 2) + 1].var[(idx)] = val.s1;
+#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 2) + 0].var[(idx)] = val.s0; (arr)[((gid) * 2) + 1].var[(idx)] = val.s1;
#elif VECT_SIZE == 4
-#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 4) + 0].var[(idx)] = val.s0; (arr)[((gid) * 4) + 1].var[(idx)] = val.s1; (arr)[((gid) * 4) + 2].var[(idx)] = val.s2; (arr)[((gid) * 4) + 3].var[(idx)] = val.s3;
+#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 4) + 0].var[(idx)] = val.s0; (arr)[((gid) * 4) + 1].var[(idx)] = val.s1; (arr)[((gid) * 4) + 2].var[(idx)] = val.s2; (arr)[((gid) * 4) + 3].var[(idx)] = val.s3;
#elif VECT_SIZE == 8
-#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 8) + 0].var[(idx)] = val.s0; (arr)[((gid) * 8) + 1].var[(idx)] = val.s1; (arr)[((gid) * 8) + 2].var[(idx)] = val.s2; (arr)[((gid) * 8) + 3].var[(idx)] = val.s3; (arr)[((gid) * 8) + 4].var[(idx)] = val.s4; (arr)[((gid) * 8) + 5].var[(idx)] = val.s5; (arr)[((gid) * 8) + 6].var[(idx)] = val.s6; (arr)[((gid) * 8) + 7].var[(idx)] = val.s7;
+#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 8) + 0].var[(idx)] = val.s0; (arr)[((gid) * 8) + 1].var[(idx)] = val.s1; (arr)[((gid) * 8) + 2].var[(idx)] = val.s2; (arr)[((gid) * 8) + 3].var[(idx)] = val.s3; (arr)[((gid) * 8) + 4].var[(idx)] = val.s4; (arr)[((gid) * 8) + 5].var[(idx)] = val.s5; (arr)[((gid) * 8) + 6].var[(idx)] = val.s6; (arr)[((gid) * 8) + 7].var[(idx)] = val.s7;
#elif VECT_SIZE == 16
-#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 8) + 0].var[(idx)] = val.s0; (arr)[((gid) * 8) + 1].var[(idx)] = val.s1; (arr)[((gid) * 8) + 2].var[(idx)] = val.s2; (arr)[((gid) * 8) + 3].var[(idx)] = val.s3; (arr)[((gid) * 8) + 4].var[(idx)] = val.s4; (arr)[((gid) * 8) + 5].var[(idx)] = val.s5; (arr)[((gid) * 8) + 6].var[(idx)] = val.s6; (arr)[((gid) * 8) + 7].var[(idx)] = val.s7; (arr)[((gid) * 8) + 8].var[(idx)] = val.s8; (arr)[((gid) * 8) + 9].var[(idx)] = val.s9; (arr)[((gid) * 8) + 10].var[(idx)] = val.sa; (arr)[((gid) * 8) + 11].var[(idx)] = val.sb; (arr)[((gid) * 8) + 12].var[(idx)] = val.sc; (arr)[((gid) * 8) + 13].var[(idx)] = val.sd; (arr)[((gid) * 8) + 14].var[(idx)] = val.se; (arr)[((gid) * 8) + 15].var[(idx)] = val.sf;
+#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 16) + 0].var[(idx)] = val.s0; (arr)[((gid) * 16) + 1].var[(idx)] = val.s1; (arr)[((gid) * 16) + 2].var[(idx)] = val.s2; (arr)[((gid) * 16) + 3].var[(idx)] = val.s3; (arr)[((gid) * 16) + 4].var[(idx)] = val.s4; (arr)[((gid) * 16) + 5].var[(idx)] = val.s5; (arr)[((gid) * 16) + 6].var[(idx)] = val.s6; (arr)[((gid) * 16) + 7].var[(idx)] = val.s7; (arr)[((gid) * 16) + 8].var[(idx)] = val.s8; (arr)[((gid) * 16) + 9].var[(idx)] = val.s9; (arr)[((gid) * 16) + 10].var[(idx)] = val.sa; (arr)[((gid) * 16) + 11].var[(idx)] = val.sb; (arr)[((gid) * 16) + 12].var[(idx)] = val.sc; (arr)[((gid) * 16) + 13].var[(idx)] = val.sd; (arr)[((gid) * 16) + 14].var[(idx)] = val.se; (arr)[((gid) * 16) + 15].var[(idx)] = val.sf;
#endif
101,
111,
1711,
- 3000, // broken in osx
+ 3000,
1000,
1100,
2100,
12800,
- 1500, // broken in osx
+ 1500,
12400,
500,
3200,
10410,
10500,
10600,
- 10700, // broken in osx
+ 10700,
9000,
5200,
6800,
u32 steps_accel[STEPS_ACCEL_CNT];
u32 steps_loops[STEPS_LOOPS_CNT];
- for (int i = 1; i < STEPS_ACCEL_CNT; i++)
+ for (int i = 0; i < STEPS_ACCEL_CNT; i++)
{
steps_accel[i] = 1 << i;
}
u32 kernel_loops_tmp;
- for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp >= kernel_loops_min; kernel_loops_tmp >>= 1)
+ for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp > kernel_loops_min; kernel_loops_tmp >>= 1)
{
const double exec_ms = try_run (device_param, kernel_accel_min, kernel_loops_tmp, 1);
if (exec_ms < target_ms) break;
-
- if (kernel_loops_tmp == kernel_loops_min) break;
}
// kernel-accel
- double e_best = 0;
-
- for (int i = 0; i < STEPS_ACCEL_CNT; i++)
+ if (kernel_accel_min < kernel_accel_max)
{
- const u32 kernel_accel_try = steps_accel[i];
+ double e_best = 0;
+
+ for (int i = 0; i < STEPS_ACCEL_CNT; i++)
+ {
+ const u32 kernel_accel_try = steps_accel[i];
- if (kernel_accel_try < kernel_accel_min) continue;
- if (kernel_accel_try > kernel_accel_max) break;
+ if (kernel_accel_try < kernel_accel_min) continue;
+ if (kernel_accel_try > kernel_accel_max) break;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1);
+ const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1);
- if (exec_ms > target_ms) break;
+ if (exec_ms > target_ms) break;
- const double e = kernel_accel_try / exec_ms;
+ const double e = kernel_accel_try / exec_ms;
- if (e > e_best)
- {
- kernel_accel = kernel_accel_try;
+ if (e > e_best)
+ {
+ kernel_accel = kernel_accel_try;
- e_best = e;
+ e_best = e;
+ }
}
}
// kernel-loops final
- e_best = 0;
-
- for (int i = 0; i < STEPS_LOOPS_CNT; i++)
+ if (kernel_loops_min < kernel_loops_max)
{
- const u32 kernel_loops_try = steps_loops[i];
+ double e_best = 0;
- if (kernel_loops_try < kernel_loops_min) continue;
- if (kernel_loops_try > kernel_loops_max) break;
+ for (int i = 0; i < STEPS_LOOPS_CNT; i++)
+ {
+ const u32 kernel_loops_try = steps_loops[i];
- const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1);
+ if (kernel_loops_try < kernel_loops_min) continue;
+ if (kernel_loops_try > kernel_loops_max) break;
- if (exec_ms > target_ms) break;
+ const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1);
- const double e = kernel_loops_try / exec_ms;
+ if (exec_ms > target_ms) break;
- if (e > e_best)
- {
- kernel_loops = kernel_loops_try;
+ const double e = kernel_loops_try / exec_ms;
+
+ if (e > e_best)
+ {
+ kernel_loops = kernel_loops_try;
- e_best = e;
+ e_best = e;
+ }
}
}
// reset
- u32 kernel_accel_try = kernel_accel;
- u32 kernel_loops_try = kernel_loops;
-
- for (int i = 0; i < 2; i++)
+ if (kernel_accel_min < kernel_accel_max)
{
- kernel_accel_try >>= 1;
- kernel_loops_try <<= 1;
+ u32 kernel_accel_try = kernel_accel;
+ u32 kernel_loops_try = kernel_loops;
- if (kernel_accel_try < kernel_accel_min) break;
- if (kernel_loops_try > kernel_loops_max) break;
+ for (int i = 0; i < 2; i++)
+ {
+ kernel_accel_try >>= 1;
+ kernel_loops_try <<= 1;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
+ if (kernel_accel_try < kernel_accel_min) break;
+ if (kernel_loops_try > kernel_loops_max) break;
- if (exec_ms < exec_best)
- {
- kernel_accel_best = kernel_accel_try;
- kernel_loops_best = kernel_loops_try;
+ const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
- exec_best = exec_ms;
+ if (exec_ms < exec_best)
+ {
+ kernel_accel_best = kernel_accel_try;
+ kernel_loops_best = kernel_loops_try;
+
+ exec_best = exec_ms;
+ }
}
+
+ kernel_accel = kernel_accel_best;
+ kernel_loops = kernel_loops_best;
}
// reset
- kernel_accel_try = kernel_accel;
- kernel_loops_try = kernel_loops;
- for (int i = 0; i < 2; i++)
+ if (kernel_loops_min < kernel_loops_max)
{
- kernel_accel_try <<= 1;
- kernel_loops_try >>= 1;
+ u32 kernel_accel_try = kernel_accel;
+ u32 kernel_loops_try = kernel_loops;
+
+ for (int i = 0; i < 2; i++)
+ {
+ kernel_accel_try <<= 1;
+ kernel_loops_try >>= 1;
- if (kernel_accel_try > kernel_accel_max) break;
- if (kernel_loops_try < kernel_loops_min) break;
+ if (kernel_accel_try > kernel_accel_max) break;
+ if (kernel_loops_try < kernel_loops_min) break;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
+ const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
- if (exec_ms < exec_best)
- {
- kernel_accel_best = kernel_accel_try;
- kernel_loops_best = kernel_loops_try;
+ if (exec_ms < exec_best)
+ {
+ kernel_accel_best = kernel_accel_try;
+ kernel_loops_best = kernel_loops_try;
- exec_best = exec_ms;
+ exec_best = exec_ms;
+ }
}
- }
- kernel_accel = kernel_accel_best;
- kernel_loops = kernel_loops_best;
+ kernel_accel = kernel_accel_best;
+ kernel_loops = kernel_loops_best;
+ }
// reset timer
#endif // HAVE_ADK
#endif // HAVE_HWMON
- #ifdef OSX
- if (hash_mode == 3000 || hash_mode == 1500 || hash_mode == 10700)
- {
- if (force == 0)
- {
- log_info ("");
- log_info ("Warning: Hash mode %d is not stable with OSX.", hash_mode);
- log_info ("You can use --force to override this but do not post error reports if you do so");
- log_info ("");
-
- continue;
- }
- }
- #endif
-
#ifdef DEBUG
if (benchmark == 1) log_info ("Hashmode: %d", data.hash_mode);
#endif
SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6221 6231 6241 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12100 12200 12300 12400 12500 12800 12900 13000"
-OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0"
+OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 -u 1024 -n 128"
OUTD="test_$(date +%s)"