X-Git-Url: https://www.flypig.org.uk/git/?a=blobdiff_plain;f=OpenCL%2Fm11400_a1.cl;h=a6ca88f692d37402b0954f4358520a10cc94fc27;hb=8702d0e3e155e6a61ca1b0dcdc1ec924d92b8436;hp=5ddc22705f691f7e072f7884151da63fb0fde147;hpb=bacc1049e3152771d306d97ce858f05d4024d331;p=hashcat.git diff --git a/OpenCL/m11400_a1.cl b/OpenCL/m11400_a1.cl index 5ddc227..a6ca88f 100644 --- a/OpenCL/m11400_a1.cl +++ b/OpenCL/m11400_a1.cl @@ -5,60 +5,65 @@ #define _MD5_ -#include "include/constants.h" -#include "include/kernel_vendor.h" - -#define DGST_R0 0 -#define DGST_R1 3 -#define DGST_R2 2 -#define DGST_R3 1 - -#include "include/kernel_functions.c" -#include "OpenCL/types_ocl.c" -#include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] - -static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 append3[4], const u32 append_len) +//incompatible because of brances +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { const u32 mod = block_len & 3; const u32 div = block_len / 4; - #if defined IS_AMD || defined IS_UNKNOWN + #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; - u32 append0_t[4]; + u32x append0_t[4]; append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32 append1_t[4]; + u32x append1_t[4]; append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32 append2_t[4]; + u32x append2_t[4]; append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32 append3_t[4]; + u32x append3_t[4]; append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32 append4_t[4]; + u32x append4_t[4]; append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); append4_t[1] = 0; @@ -100,35 +105,35 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - u32 append0_t[4]; + u32x append0_t[4]; append0_t[0] = __byte_perm ( 0, append0[0], selector); append0_t[1] = __byte_perm (append0[0], append0[1], selector); append0_t[2] = __byte_perm (append0[1], append0[2], selector); append0_t[3] = __byte_perm (append0[2], append0[3], selector); - u32 append1_t[4]; + u32x append1_t[4]; append1_t[0] = __byte_perm (append0[3], append1[0], selector); append1_t[1] = __byte_perm (append1[0], append1[1], selector); append1_t[2] = __byte_perm (append1[1], append1[2], selector); append1_t[3] = __byte_perm (append1[2], append1[3], selector); - u32 append2_t[4]; + u32x append2_t[4]; append2_t[0] = __byte_perm (append1[3], append2[0], selector); append2_t[1] = __byte_perm (append2[0], append2[1], selector); append2_t[2] = __byte_perm (append2[1], append2[2], selector); append2_t[3] = __byte_perm (append2[2], append2[3], selector); - u32 append3_t[4]; + u32x append3_t[4]; append3_t[0] = __byte_perm (append2[3], append3[0], selector); append3_t[1] = __byte_perm (append3[0], append3[1], selector); append3_t[2] = __byte_perm (append3[1], append3[2], selector); append3_t[3] = __byte_perm (append3[2], append3[3], selector); - u32 append4_t[4]; + u32x append4_t[4]; append4_t[0] = __byte_perm (append3[3], 0, selector); append4_t[1] = 0; @@ -743,92 +748,53 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const return new_len; } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier */ + const u32 gid = get_global_id (0); const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); /** - * base + * bin2asc table */ - const u32 gid = get_global_id (0); - - u32 wordl0[4]; - - wordl0[0] = pws[gid].i[ 0]; - wordl0[1] = pws[gid].i[ 1]; - wordl0[2] = pws[gid].i[ 2]; - wordl0[3] = pws[gid].i[ 3]; - - u32 wordl1[4]; - - wordl1[0] = pws[gid].i[ 4]; - wordl1[1] = pws[gid].i[ 5]; - wordl1[2] = pws[gid].i[ 6]; - wordl1[3] = pws[gid].i[ 7]; + __local u32 l_bin2asc[256]; - u32 wordl2[4]; + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; - wordl2[0] = 0; - wordl2[1] = 0; - wordl2[2] = 0; - wordl2[3] = 0; + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } - u32 wordl3[4]; + barrier (CLK_LOCAL_MEM_FENCE); - wordl3[0] = 0; - wordl3[1] = 0; - wordl3[2] = 0; - wordl3[3] = 0; + if (gid >= gid_max) return; /** - * bin2asc table + * base */ - __local u32 l_bin2asc[256]; - - const u32 lid4 = lid * 4; - - const u32 lid40 = lid4 + 0; - const u32 lid41 = lid4 + 1; - const u32 lid42 = lid4 + 2; - const u32 lid43 = lid4 + 3; - - const u32 v400 = (lid40 >> 0) & 15; - const u32 v401 = (lid40 >> 4) & 15; - const u32 v410 = (lid41 >> 0) & 15; - const u32 v411 = (lid41 >> 4) & 15; - const u32 v420 = (lid42 >> 0) & 15; - const u32 v421 = (lid42 >> 4) & 15; - const u32 v430 = (lid43 >> 0) & 15; - const u32 v431 = (lid43 >> 4) & 15; - - l_bin2asc[lid40] = ((v400 < 10) ? '0' + v400 : 'a' - 10 + v400) << 8 - | ((v401 < 10) ? '0' + v401 : 'a' - 10 + v401) << 0; - l_bin2asc[lid41] = ((v410 < 10) ? '0' + v410 : 'a' - 10 + v410) << 8 - | ((v411 < 10) ? '0' + v411 : 'a' - 10 + v411) << 0; - l_bin2asc[lid42] = ((v420 < 10) ? '0' + v420 : 'a' - 10 + v420) << 8 - | ((v421 < 10) ? '0' + v421 : 'a' - 10 + v421) << 0; - l_bin2asc[lid43] = ((v430 < 10) ? '0' + v430 : 'a' - 10 + v430) << 8 - | ((v431 < 10) ? '0' + v431 : 'a' - 10 + v431) << 0; - - barrier (CLK_LOCAL_MEM_FENCE); + u32 pw_buf0[4]; + u32 pw_buf1[4]; - if (gid >= gid_max) return; + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; const u32 pw_l_len = pws[gid].pw_len; - if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) - { - append_0x80_2x4 (wordl0, wordl1, pw_l_len); - - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); - } - /** * salt */ @@ -836,6 +802,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo const u32 salt_len = esalt_bufs[salt_pos].salt_len; // not a bug, we need to get it from the esalt u32 salt_buf0[16]; + u32 salt_buf1[16]; salt_buf0[ 0] = esalt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[ 1] = esalt_bufs[salt_pos].salt_buf[ 1]; @@ -853,9 +820,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo salt_buf0[13] = esalt_bufs[salt_pos].salt_buf[13]; salt_buf0[14] = esalt_bufs[salt_pos].salt_buf[14]; salt_buf0[15] = esalt_bufs[salt_pos].salt_buf[15]; - - u32 salt_buf1[16]; - salt_buf1[ 0] = esalt_bufs[salt_pos].salt_buf[16]; salt_buf1[ 1] = esalt_bufs[salt_pos].salt_buf[17]; salt_buf1[ 2] = esalt_bufs[salt_pos].salt_buf[18]; @@ -880,6 +844,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo const u32 esalt_len = esalt_bufs[salt_pos].esalt_len; u32 esalt_buf0[16]; + u32 esalt_buf1[16]; + u32 esalt_buf2[16]; esalt_buf0[ 0] = esalt_bufs[salt_pos].esalt_buf[ 0]; esalt_buf0[ 1] = esalt_bufs[salt_pos].esalt_buf[ 1]; @@ -897,9 +863,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo esalt_buf0[13] = esalt_bufs[salt_pos].esalt_buf[13]; esalt_buf0[14] = esalt_bufs[salt_pos].esalt_buf[14]; esalt_buf0[15] = esalt_bufs[salt_pos].esalt_buf[15]; - - u32 esalt_buf1[16]; - esalt_buf1[ 0] = esalt_bufs[salt_pos].esalt_buf[16]; esalt_buf1[ 1] = esalt_bufs[salt_pos].esalt_buf[17]; esalt_buf1[ 2] = esalt_bufs[salt_pos].esalt_buf[18]; @@ -916,9 +879,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo esalt_buf1[13] = esalt_bufs[salt_pos].esalt_buf[29]; esalt_buf1[14] = esalt_bufs[salt_pos].esalt_buf[30]; esalt_buf1[15] = esalt_bufs[salt_pos].esalt_buf[31]; - - u32 esalt_buf2[16]; - esalt_buf2[ 0] = esalt_bufs[salt_pos].esalt_buf[32]; esalt_buf2[ 1] = esalt_bufs[salt_pos].esalt_buf[33]; esalt_buf2[ 2] = esalt_bufs[salt_pos].esalt_buf[34]; @@ -943,74 +903,76 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 pw_r_len = combs_buf[il_pos].pw_len; - - const u32 pw_len = pw_l_len + pw_r_len; - - u32 wordr0[4]; + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - wordr0[0] = combs_buf[il_pos].i[0]; - wordr0[1] = combs_buf[il_pos].i[1]; - wordr0[2] = combs_buf[il_pos].i[2]; - wordr0[3] = combs_buf[il_pos].i[3]; + const u32x pw_len = pw_l_len + pw_r_len; - u32 wordr1[4]; - - wordr1[0] = combs_buf[il_pos].i[4]; - wordr1[1] = combs_buf[il_pos].i[5]; - wordr1[2] = combs_buf[il_pos].i[6]; - wordr1[3] = combs_buf[il_pos].i[7]; - - u32 wordr2[4]; - - wordr2[0] = 0; - wordr2[1] = 0; - wordr2[2] = 0; - wordr2[3] = 0; - - u32 wordr3[4]; + /** + * concat password candidate + */ - wordr3[0] = 0; - wordr3[1] = 0; - wordr3[2] = 0; - wordr3[3] = 0; + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); } - u32 w0[4]; + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - - u32 w1[4]; - w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - - u32 w2[4]; - w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - - u32 w3[4]; - w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - const u32 pw_salt_len = salt_len + pw_len; + const u32x pw_salt_len = salt_len + pw_len; /* * HA1 = md5 ($salt . $pass) @@ -1018,7 +980,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo // append the pass to the salt - u32 block0[16]; + u32x block0[16]; + u32x block1[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -1036,9 +999,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo block0[13] = salt_buf0[13]; block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - - u32 block1[16]; - block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; block1[ 2] = salt_buf1[ 2]; @@ -1060,29 +1020,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); - u32 w0_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = block0[14]; @@ -1095,12 +1049,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo // md5 - u32 tmp2; - - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -1136,22 +1088,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1177,26 +1129,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo if (block_len > 55) { - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; w0_t[0] = block1[ 0]; w0_t[1] = block1[ 1]; w0_t[2] = block1[ 2]; w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; w1_t[1] = block1[ 5]; w1_t[2] = block1[ 6]; w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; w2_t[1] = block1[ 9]; w2_t[2] = block1[10]; w2_t[3] = block1[11]; - w3_t[0] = block1[12]; w3_t[1] = block1[13]; w3_t[2] = pw_salt_len * 8; @@ -1236,22 +1185,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1297,12 +1246,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo | uint_to_hex_lower8 ((d >> 8) & 255) << 16; w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; w2_t[1] = esalt_buf0[1]; w2_t[2] = esalt_buf0[2]; w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; w3_t[1] = esalt_buf0[5]; w3_t[2] = esalt_buf0[6]; @@ -1350,22 +1297,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1389,10 +1336,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -1400,17 +1347,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo w0_t[1] = esalt_buf0[ 9]; w0_t[2] = esalt_buf0[10]; w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; w1_t[1] = esalt_buf0[13]; w1_t[2] = esalt_buf0[14]; w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; w2_t[1] = esalt_buf1[ 1]; w2_t[2] = esalt_buf1[ 2]; w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; w3_t[1] = esalt_buf1[ 5]; w3_t[2] = esalt_buf1[ 6]; @@ -1459,22 +1403,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1513,17 +1457,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo w0_t[1] = esalt_buf1[ 9]; w0_t[2] = esalt_buf1[10]; w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; w1_t[1] = esalt_buf1[13]; w1_t[2] = esalt_buf1[14]; w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; w2_t[1] = esalt_buf2[ 1]; w2_t[2] = esalt_buf2[ 2]; w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; w3_t[1] = esalt_buf2[ 5]; w3_t[2] = digest_esalt_len * 8; @@ -1563,22 +1504,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1603,109 +1544,65 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier */ + const u32 gid = get_global_id (0); const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); /** - * base + * bin2asc table */ - const u32 gid = get_global_id (0); - - u32 wordl0[4]; - - wordl0[0] = pws[gid].i[ 0]; - wordl0[1] = pws[gid].i[ 1]; - wordl0[2] = pws[gid].i[ 2]; - wordl0[3] = pws[gid].i[ 3]; - - u32 wordl1[4]; - - wordl1[0] = pws[gid].i[ 4]; - wordl1[1] = pws[gid].i[ 5]; - wordl1[2] = pws[gid].i[ 6]; - wordl1[3] = pws[gid].i[ 7]; + __local u32 l_bin2asc[256]; - u32 wordl2[4]; + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; - wordl2[0] = 0; - wordl2[1] = 0; - wordl2[2] = 0; - wordl2[3] = 0; + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } - u32 wordl3[4]; + barrier (CLK_LOCAL_MEM_FENCE); - wordl3[0] = 0; - wordl3[1] = 0; - wordl3[2] = 0; - wordl3[3] = 0; + if (gid >= gid_max) return; /** - * bin2asc table + * base */ - __local u32 l_bin2asc[256]; + u32 pw_buf0[4]; + u32 pw_buf1[4]; - const u32 lid4 = lid * 4; - - const u32 lid40 = lid4 + 0; - const u32 lid41 = lid4 + 1; - const u32 lid42 = lid4 + 2; - const u32 lid43 = lid4 + 3; - - const u32 v400 = (lid40 >> 0) & 15; - const u32 v401 = (lid40 >> 4) & 15; - const u32 v410 = (lid41 >> 0) & 15; - const u32 v411 = (lid41 >> 4) & 15; - const u32 v420 = (lid42 >> 0) & 15; - const u32 v421 = (lid42 >> 4) & 15; - const u32 v430 = (lid43 >> 0) & 15; - const u32 v431 = (lid43 >> 4) & 15; - - l_bin2asc[lid40] = ((v400 < 10) ? '0' + v400 : 'a' - 10 + v400) << 8 - | ((v401 < 10) ? '0' + v401 : 'a' - 10 + v401) << 0; - l_bin2asc[lid41] = ((v410 < 10) ? '0' + v410 : 'a' - 10 + v410) << 8 - | ((v411 < 10) ? '0' + v411 : 'a' - 10 + v411) << 0; - l_bin2asc[lid42] = ((v420 < 10) ? '0' + v420 : 'a' - 10 + v420) << 8 - | ((v421 < 10) ? '0' + v421 : 'a' - 10 + v421) << 0; - l_bin2asc[lid43] = ((v430 < 10) ? '0' + v430 : 'a' - 10 + v430) << 8 - | ((v431 < 10) ? '0' + v431 : 'a' - 10 + v431) << 0; - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; const u32 pw_l_len = pws[gid].pw_len; - if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) - { - append_0x80_2x4 (wordl0, wordl1, pw_l_len); - - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); - } - /** * salt */ @@ -1713,6 +1610,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo const u32 salt_len = esalt_bufs[salt_pos].salt_len; // not a bug, we need to get it from the esalt u32 salt_buf0[16]; + u32 salt_buf1[16]; salt_buf0[ 0] = esalt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[ 1] = esalt_bufs[salt_pos].salt_buf[ 1]; @@ -1730,9 +1628,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo salt_buf0[13] = esalt_bufs[salt_pos].salt_buf[13]; salt_buf0[14] = esalt_bufs[salt_pos].salt_buf[14]; salt_buf0[15] = esalt_bufs[salt_pos].salt_buf[15]; - - u32 salt_buf1[16]; - salt_buf1[ 0] = esalt_bufs[salt_pos].salt_buf[16]; salt_buf1[ 1] = esalt_bufs[salt_pos].salt_buf[17]; salt_buf1[ 2] = esalt_bufs[salt_pos].salt_buf[18]; @@ -1757,6 +1652,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo const u32 esalt_len = esalt_bufs[salt_pos].esalt_len; u32 esalt_buf0[16]; + u32 esalt_buf1[16]; + u32 esalt_buf2[16]; esalt_buf0[ 0] = esalt_bufs[salt_pos].esalt_buf[ 0]; esalt_buf0[ 1] = esalt_bufs[salt_pos].esalt_buf[ 1]; @@ -1774,9 +1671,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo esalt_buf0[13] = esalt_bufs[salt_pos].esalt_buf[13]; esalt_buf0[14] = esalt_bufs[salt_pos].esalt_buf[14]; esalt_buf0[15] = esalt_bufs[salt_pos].esalt_buf[15]; - - u32 esalt_buf1[16]; - esalt_buf1[ 0] = esalt_bufs[salt_pos].esalt_buf[16]; esalt_buf1[ 1] = esalt_bufs[salt_pos].esalt_buf[17]; esalt_buf1[ 2] = esalt_bufs[salt_pos].esalt_buf[18]; @@ -1793,9 +1687,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo esalt_buf1[13] = esalt_bufs[salt_pos].esalt_buf[29]; esalt_buf1[14] = esalt_bufs[salt_pos].esalt_buf[30]; esalt_buf1[15] = esalt_bufs[salt_pos].esalt_buf[31]; - - u32 esalt_buf2[16]; - esalt_buf2[ 0] = esalt_bufs[salt_pos].esalt_buf[32]; esalt_buf2[ 1] = esalt_bufs[salt_pos].esalt_buf[33]; esalt_buf2[ 2] = esalt_bufs[salt_pos].esalt_buf[34]; @@ -1832,74 +1723,76 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 pw_r_len = combs_buf[il_pos].pw_len; - - const u32 pw_len = pw_l_len + pw_r_len; - - u32 wordr0[4]; - - wordr0[0] = combs_buf[il_pos].i[0]; - wordr0[1] = combs_buf[il_pos].i[1]; - wordr0[2] = combs_buf[il_pos].i[2]; - wordr0[3] = combs_buf[il_pos].i[3]; - - u32 wordr1[4]; + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - wordr1[0] = combs_buf[il_pos].i[4]; - wordr1[1] = combs_buf[il_pos].i[5]; - wordr1[2] = combs_buf[il_pos].i[6]; - wordr1[3] = combs_buf[il_pos].i[7]; + const u32x pw_len = pw_l_len + pw_r_len; - u32 wordr2[4]; - - wordr2[0] = 0; - wordr2[1] = 0; - wordr2[2] = 0; - wordr2[3] = 0; - - u32 wordr3[4]; + /** + * concat password candidate + */ - wordr3[0] = 0; - wordr3[1] = 0; - wordr3[2] = 0; - wordr3[3] = 0; + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); } - u32 w0[4]; + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - - u32 w1[4]; - w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - - u32 w2[4]; - w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - - u32 w3[4]; - w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - const u32 pw_salt_len = salt_len + pw_len; + const u32x pw_salt_len = salt_len + pw_len; /* * HA1 = md5 ($salt . $pass) @@ -1907,7 +1800,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo // append the pass to the salt - u32 block0[16]; + u32x block0[16]; + u32x block1[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -1925,9 +1819,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo block0[13] = salt_buf0[13]; block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - - u32 block1[16]; - block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; block1[ 2] = salt_buf1[ 2]; @@ -1949,29 +1840,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); - u32 w0_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = block0[14]; @@ -1984,12 +1869,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo // md5 - u32 tmp2; - - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -2025,22 +1908,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2066,26 +1949,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo if (block_len > 55) { - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; w0_t[0] = block1[ 0]; w0_t[1] = block1[ 1]; w0_t[2] = block1[ 2]; w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; w1_t[1] = block1[ 5]; w1_t[2] = block1[ 6]; w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; w2_t[1] = block1[ 9]; w2_t[2] = block1[10]; w2_t[3] = block1[11]; - w3_t[0] = block1[12]; w3_t[1] = block1[13]; w3_t[2] = pw_salt_len * 8; @@ -2125,22 +2005,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2186,12 +2066,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo | uint_to_hex_lower8 ((d >> 8) & 255) << 16; w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; w2_t[1] = esalt_buf0[1]; w2_t[2] = esalt_buf0[2]; w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; w3_t[1] = esalt_buf0[5]; w3_t[2] = esalt_buf0[6]; @@ -2239,22 +2117,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2278,10 +2156,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -2289,17 +2167,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo w0_t[1] = esalt_buf0[ 9]; w0_t[2] = esalt_buf0[10]; w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; w1_t[1] = esalt_buf0[13]; w1_t[2] = esalt_buf0[14]; w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; w2_t[1] = esalt_buf1[ 1]; w2_t[2] = esalt_buf1[ 2]; w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; w3_t[1] = esalt_buf1[ 5]; w3_t[2] = esalt_buf1[ 6]; @@ -2348,22 +2223,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2402,17 +2277,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo w0_t[1] = esalt_buf1[ 9]; w0_t[2] = esalt_buf1[10]; w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; w1_t[1] = esalt_buf1[13]; w1_t[2] = esalt_buf1[14]; w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; w2_t[1] = esalt_buf2[ 1]; w2_t[2] = esalt_buf2[ 2]; w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; w3_t[1] = esalt_buf2[ 5]; w3_t[2] = digest_esalt_len * 8; @@ -2452,22 +2324,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2492,19 +2364,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { }