diff --git a/OpenCL/m11400_a0.cl b/OpenCL/m11400_a0.cl index 7f55294..b774ef2 100644 --- a/OpenCL/m11400_a0.cl +++ b/OpenCL/m11400_a0.cl @@ -1,75 +1,73 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ #define _MD5_ -#include "include/constants.h" -#include "include/kernel_vendor.h" - -#define DGST_R0 0 -#define DGST_R1 3 -#define DGST_R2 2 -#define DGST_R3 1 - -#include "include/kernel_functions.c" -#include "types_ocl.c" -#include "common.c" -#include "include/rp_gpu.h" -#include "rp.c" - -#define COMPARE_S "check_single_comp4.c" -#define COMPARE_M "check_multi_comp4.c" - -#ifdef VECT_SIZE1 -#define uint_to_hex_lower8(i) l_bin2asc[(i)] -#endif - -#ifdef VECT_SIZE2 +//incompatible because of branches +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_simd.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#endif - -#ifdef VECT_SIZE4 +#elif VECT_SIZE == 4 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 append3[4], const u32 append_len) +u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { const u32 mod = block_len & 3; const u32 div = block_len / 4; + #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; - u32 append0_t[4]; + u32x append0_t[4]; - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); + append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32 append1_t[4]; + u32x append1_t[4]; append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32 append2_t[4]; + u32x append2_t[4]; append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); append2_t[1] =
amd_bytealign (append2[1], append2[0], offset_minus_4); append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32 append3_t[4]; + u32x append3_t[4]; append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32 append4_t[4]; + u32x append4_t[4]; append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); append4_t[1] = 0; @@ -103,6 +101,49 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const append4_t[2] = 0; append4_t[3] = 0; } + #endif + + #ifdef IS_NV + + const int offset_minus_4 = 4 - mod; + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + u32x append0_t[4]; + + append0_t[0] = __byte_perm ( 0, append0[0], selector); + append0_t[1] = __byte_perm (append0[0], append0[1], selector); + append0_t[2] = __byte_perm (append0[1], append0[2], selector); + append0_t[3] = __byte_perm (append0[2], append0[3], selector); + + u32x append1_t[4]; + + append1_t[0] = __byte_perm (append0[3], append1[0], selector); + append1_t[1] = __byte_perm (append1[0], append1[1], selector); + append1_t[2] = __byte_perm (append1[1], append1[2], selector); + append1_t[3] = __byte_perm (append1[2], append1[3], selector); + + u32x append2_t[4]; + + append2_t[0] = __byte_perm (append1[3], append2[0], selector); + append2_t[1] = __byte_perm (append2[0], append2[1], selector); + append2_t[2] = __byte_perm (append2[1], append2[2], selector); + append2_t[3] = __byte_perm (append2[2], append2[3], selector); + + u32x append3_t[4]; + + append3_t[0] = __byte_perm (append2[3], append3[0], selector); + append3_t[1] = __byte_perm (append3[0], append3[1], selector); + append3_t[2] = __byte_perm (append3[1], append3[2], selector); + append3_t[3] = __byte_perm (append3[2], append3[3], selector); + + u32x append4_t[4]; + + append4_t[0] = __byte_perm (append3[3], 0, selector); + append4_t[1] = 0; + append4_t[2] = 0; + append4_t[3] = 0; + #endif switch (div) { @@ -711,29 +752,46 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const return new_len; } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 
*bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier */ + const u32 gid = get_global_id (0); const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); /** - * base + * bin2asc table */ - const u32 gid = get_global_id (0); + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ u32 pw_buf0[4]; + u32 pw_buf1[4]; pw_buf0[0] = pws[gid].i[ 0]; pw_buf0[1] = pws[gid].i[ 1]; pw_buf0[2] = pws[gid].i[ 2]; pw_buf0[3] = pws[gid].i[ 3]; - - u32 pw_buf1[4]; - pw_buf1[0] = pws[gid].i[ 4]; pw_buf1[1] = pws[gid].i[ 5]; pw_buf1[2] = pws[gid].i[ 6]; @@ -741,41 +799,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo const u32 pw_len = pws[gid].pw_len; - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - const u32 lid4 = lid * 4; - - const u32 lid40 = lid4 + 0; - const u32 lid41 = lid4 + 1; - const u32 lid42 = lid4 + 2; - const u32 lid43 = lid4 + 3; - - const u32 v400 = (lid40 >> 0) & 15; - const u32 v401 = (lid40 >> 4) & 15; - const u32 v410 = (lid41 >> 0) & 15; - const u32 v411 = (lid41 >> 4) & 15; - const u32 v420 = (lid42 >> 0) & 15; - const u32 v421 = (lid42 >> 4) & 15; - const u32 v430 = (lid43 >> 0) & 15; - const u32 v431 = (lid43 >> 4) & 15; - - l_bin2asc[lid40] = ((v400 < 10) ? '0' + v400 : 'a' - 10 + v400) << 8 - | ((v401 < 10) ? '0' + v401 : 'a' - 10 + v401) << 0; - l_bin2asc[lid41] = ((v410 < 10) ? '0' + v410 : 'a' - 10 + v410) << 8 - | ((v411 < 10) ? '0' + v411 : 'a' - 10 + v411) << 0; - l_bin2asc[lid42] = ((v420 < 10) ? '0' + v420 : 'a' - 10 + v420) << 8 - | ((v421 < 10) ? '0' + v421 : 'a' - 10 + v421) << 0; - l_bin2asc[lid43] = ((v430 < 10) ? '0' + v430 : 'a' - 10 + v430) << 8 - | ((v431 < 10) ? 
'0' + v431 : 'a' - 10 + v431) << 0; - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - /** * salt */ @@ -783,6 +806,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo const u32 salt_len = esalt_bufs[salt_pos].salt_len; // not a bug, we need to get it from the esalt u32 salt_buf0[16]; + u32 salt_buf1[16]; salt_buf0[ 0] = esalt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[ 1] = esalt_bufs[salt_pos].salt_buf[ 1]; @@ -800,9 +824,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo salt_buf0[13] = esalt_bufs[salt_pos].salt_buf[13]; salt_buf0[14] = esalt_bufs[salt_pos].salt_buf[14]; salt_buf0[15] = esalt_bufs[salt_pos].salt_buf[15]; - - u32 salt_buf1[16]; - salt_buf1[ 0] = esalt_bufs[salt_pos].salt_buf[16]; salt_buf1[ 1] = esalt_bufs[salt_pos].salt_buf[17]; salt_buf1[ 2] = esalt_bufs[salt_pos].salt_buf[18]; @@ -827,6 +848,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo const u32 esalt_len = esalt_bufs[salt_pos].esalt_len; u32 esalt_buf0[16]; + u32 esalt_buf1[16]; + u32 esalt_buf2[16]; esalt_buf0[ 0] = esalt_bufs[salt_pos].esalt_buf[ 0]; esalt_buf0[ 1] = esalt_bufs[salt_pos].esalt_buf[ 1]; @@ -844,9 +867,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo esalt_buf0[13] = esalt_bufs[salt_pos].esalt_buf[13]; esalt_buf0[14] = esalt_bufs[salt_pos].esalt_buf[14]; esalt_buf0[15] = esalt_bufs[salt_pos].esalt_buf[15]; - - u32 esalt_buf1[16]; - esalt_buf1[ 0] = esalt_bufs[salt_pos].esalt_buf[16]; esalt_buf1[ 1] = esalt_bufs[salt_pos].esalt_buf[17]; esalt_buf1[ 2] = esalt_bufs[salt_pos].esalt_buf[18]; @@ -863,9 +883,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo esalt_buf1[13] = esalt_bufs[salt_pos].esalt_buf[29]; esalt_buf1[14] = esalt_bufs[salt_pos].esalt_buf[30]; esalt_buf1[15] = esalt_bufs[salt_pos].esalt_buf[31]; - - u32 esalt_buf2[16]; - esalt_buf2[ 0] = esalt_bufs[salt_pos].esalt_buf[32]; esalt_buf2[ 1] = esalt_bufs[salt_pos].esalt_buf[33]; esalt_buf2[ 2] = esalt_bufs[salt_pos].esalt_buf[34]; @@ -890,41 +907,18 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo * loop */ - for (u32 il_pos = 0; il_pos < rules_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - u32 w0[4]; - - w0[0] = pw_buf0[0]; - w0[1] = pw_buf0[1]; - w0[2] = pw_buf0[2]; - w0[3] = pw_buf0[3]; - - u32 w1[4]; - - w1[0] = pw_buf1[0]; - w1[1] = pw_buf1[1]; - w1[2] = pw_buf1[2]; - w1[3] = pw_buf1[3]; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); + append_0x80_2x4_VV (w0, w1, out_len); - append_0x80_2 (w0, w1, out_len); - - const u32 pw_salt_len = salt_len + out_len; + const u32x pw_salt_len = salt_len + out_len; /* * HA1 = md5 ($salt . 
$pass) @@ -932,7 +926,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo // append the pass to the salt - u32 block0[16]; + u32x block0[16]; + u32x block1[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -950,9 +945,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo block0[13] = salt_buf0[13]; block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - - u32 block1[16]; - block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; block1[ 2] = salt_buf1[ 2]; @@ -974,29 +966,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, out_len); - u32 w0_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = block0[14]; @@ -1009,12 +995,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo // md5 - u32 tmp2; - - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -1050,22 +1034,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, 
a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1091,26 +1075,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo if (block_len > 55) { - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; w0_t[0] = block1[ 0]; w0_t[1] = block1[ 1]; w0_t[2] = block1[ 2]; w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; w1_t[1] = block1[ 5]; w1_t[2] = block1[ 6]; w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; w2_t[1] = block1[ 9]; w2_t[2] = block1[10]; w2_t[3] = block1[11]; - w3_t[0] = block1[12]; w3_t[1] = block1[13]; w3_t[2] = pw_salt_len * 8; @@ -1150,22 +1131,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1211,12 +1192,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo | uint_to_hex_lower8 ((d >> 8) & 255) << 16; w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; w2_t[1] = esalt_buf0[1]; w2_t[2] = esalt_buf0[2]; w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; w3_t[1] = esalt_buf0[5]; w3_t[2] = esalt_buf0[6]; @@ -1264,22 +1243,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1303,10 +1282,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -1314,17 +1293,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo w0_t[1] = esalt_buf0[ 9]; w0_t[2] = esalt_buf0[10]; w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; w1_t[1] = esalt_buf0[13]; w1_t[2] = esalt_buf0[14]; w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; w2_t[1] = esalt_buf1[ 1]; w2_t[2] = esalt_buf1[ 2]; w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; w3_t[1] = esalt_buf1[ 5]; w3_t[2] = esalt_buf1[ 6]; @@ -1373,22 +1349,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, 
b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1427,17 +1403,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo w0_t[1] = esalt_buf1[ 9]; w0_t[2] = esalt_buf1[10]; w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; w1_t[1] = esalt_buf1[13]; w1_t[2] = esalt_buf1[14]; w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; w2_t[1] = esalt_buf2[ 1]; w2_t[2] = esalt_buf2[ 2]; w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; w3_t[1] = esalt_buf2[ 5]; w3_t[2] = digest_esalt_len * 8; @@ -1477,22 +1450,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], 
MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1517,46 +1490,58 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m08 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m16 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, 
const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier */ + const u32 gid = get_global_id (0); const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); /** - * base + * bin2asc table */ - const u32 gid = get_global_id (0); + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ u32 pw_buf0[4]; + u32 pw_buf1[4]; pw_buf0[0] = pws[gid].i[ 0]; pw_buf0[1] = pws[gid].i[ 1]; pw_buf0[2] = pws[gid].i[ 2]; pw_buf0[3] = pws[gid].i[ 3]; - - u32 pw_buf1[4]; - pw_buf1[0] = pws[gid].i[ 4]; pw_buf1[1] = pws[gid].i[ 5]; pw_buf1[2] = pws[gid].i[ 6]; @@ -1564,41 +1549,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo const u32 pw_len = pws[gid].pw_len; - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - const u32 lid4 = lid * 4; - - const u32 lid40 = lid4 + 0; - const u32 lid41 = lid4 + 1; - const u32 lid42 = lid4 + 2; - const u32 lid43 = lid4 + 3; - - const u32 v400 = (lid40 >> 0) & 15; - const u32 v401 = (lid40 >> 4) & 15; - const u32 v410 = (lid41 >> 0) & 15; - const u32 v411 = (lid41 >> 4) & 15; - const u32 v420 = (lid42 >> 0) & 15; - const u32 v421 = (lid42 >> 4) & 15; - const u32 v430 = (lid43 >> 0) & 15; - const u32 v431 = (lid43 >> 4) & 15; - - l_bin2asc[lid40] = ((v400 < 10) ? '0' + v400 : 'a' - 10 + v400) << 8 - | ((v401 < 10) ? '0' + v401 : 'a' - 10 + v401) << 0; - l_bin2asc[lid41] = ((v410 < 10) ? '0' + v410 : 'a' - 10 + v410) << 8 - | ((v411 < 10) ? '0' + v411 : 'a' - 10 + v411) << 0; - l_bin2asc[lid42] = ((v420 < 10) ? '0' + v420 : 'a' - 10 + v420) << 8 - | ((v421 < 10) ? '0' + v421 : 'a' - 10 + v421) << 0; - l_bin2asc[lid43] = ((v430 < 10) ? '0' + v430 : 'a' - 10 + v430) << 8 - | ((v431 < 10) ? '0' + v431 : 'a' - 10 + v431) << 0; - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - /** * salt */ @@ -1606,6 +1556,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo const u32 salt_len = esalt_bufs[salt_pos].salt_len; // not a bug, we need to get it from the esalt u32 salt_buf0[16]; + u32 salt_buf1[16]; salt_buf0[ 0] = esalt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[ 1] = esalt_bufs[salt_pos].salt_buf[ 1]; @@ -1623,9 +1574,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo salt_buf0[13] = esalt_bufs[salt_pos].salt_buf[13]; salt_buf0[14] = esalt_bufs[salt_pos].salt_buf[14]; salt_buf0[15] = esalt_bufs[salt_pos].salt_buf[15]; - - u32 salt_buf1[16]; - salt_buf1[ 0] = esalt_bufs[salt_pos].salt_buf[16]; salt_buf1[ 1] = esalt_bufs[salt_pos].salt_buf[17]; salt_buf1[ 2] = esalt_bufs[salt_pos].salt_buf[18]; @@ -1650,6 +1598,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo const u32 esalt_len = esalt_bufs[salt_pos].esalt_len; u32 esalt_buf0[16]; + u32 esalt_buf1[16]; + u32 esalt_buf2[16]; esalt_buf0[ 0] = esalt_bufs[salt_pos].esalt_buf[ 0]; esalt_buf0[ 1] = esalt_bufs[salt_pos].esalt_buf[ 1]; @@ -1667,9 +1617,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo esalt_buf0[13] = esalt_bufs[salt_pos].esalt_buf[13]; esalt_buf0[14] = esalt_bufs[salt_pos].esalt_buf[14]; esalt_buf0[15] = esalt_bufs[salt_pos].esalt_buf[15]; - - u32 esalt_buf1[16]; - esalt_buf1[ 0] = esalt_bufs[salt_pos].esalt_buf[16]; esalt_buf1[ 1] = esalt_bufs[salt_pos].esalt_buf[17]; esalt_buf1[ 2] = esalt_bufs[salt_pos].esalt_buf[18]; @@ -1686,9 +1633,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo esalt_buf1[13] = esalt_bufs[salt_pos].esalt_buf[29]; esalt_buf1[14] = esalt_bufs[salt_pos].esalt_buf[30]; esalt_buf1[15] = esalt_bufs[salt_pos].esalt_buf[31]; - - u32 esalt_buf2[16]; - esalt_buf2[ 0] = esalt_bufs[salt_pos].esalt_buf[32]; esalt_buf2[ 1] = 
esalt_bufs[salt_pos].esalt_buf[33]; esalt_buf2[ 2] = esalt_bufs[salt_pos].esalt_buf[34]; @@ -1725,41 +1669,18 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo * loop */ - for (u32 il_pos = 0; il_pos < rules_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - u32 w0[4]; - - w0[0] = pw_buf0[0]; - w0[1] = pw_buf0[1]; - w0[2] = pw_buf0[2]; - w0[3] = pw_buf0[3]; - - u32 w1[4]; - - w1[0] = pw_buf1[0]; - w1[1] = pw_buf1[1]; - w1[2] = pw_buf1[2]; - w1[3] = pw_buf1[3]; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; - const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4_VV (w0, w1, out_len); - const u32 pw_salt_len = salt_len + out_len; + const u32x pw_salt_len = salt_len + out_len; /* * HA1 = md5 ($salt . $pass) @@ -1767,7 +1688,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo // append the pass to the salt - u32 block0[16]; + u32x block0[16]; + u32x block1[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -1785,9 +1707,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo block0[13] = salt_buf0[13]; block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - - u32 block1[16]; - block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; block1[ 2] = salt_buf1[ 2]; @@ -1809,29 +1728,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, out_len); - u32 w0_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = block0[14]; @@ -1844,12 +1757,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo // md5 - u32 tmp2; - - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -1885,22 +1796,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - 
MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -1926,26 +1837,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo if (block_len > 55) { - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; w0_t[0] = block1[ 0]; w0_t[1] = block1[ 1]; w0_t[2] = block1[ 2]; w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; w1_t[1] = block1[ 5]; w1_t[2] = block1[ 6]; w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; w2_t[1] = block1[ 9]; w2_t[2] = block1[10]; w2_t[3] = block1[11]; - w3_t[0] = block1[12]; w3_t[1] = block1[13]; w3_t[2] = pw_salt_len * 8; @@ -1985,22 +1893,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, 
c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2046,12 +1954,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo | uint_to_hex_lower8 ((d >> 8) & 255) << 16; w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; w2_t[1] = esalt_buf0[1]; w2_t[2] = esalt_buf0[2]; w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; w3_t[1] = esalt_buf0[5]; w3_t[2] = esalt_buf0[6]; @@ -2099,22 +2005,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2138,10 +2044,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -2149,17 +2055,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo w0_t[1] = 
esalt_buf0[ 9]; w0_t[2] = esalt_buf0[10]; w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; w1_t[1] = esalt_buf0[13]; w1_t[2] = esalt_buf0[14]; w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; w2_t[1] = esalt_buf1[ 1]; w2_t[2] = esalt_buf1[ 2]; w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; w3_t[1] = esalt_buf1[ 5]; w3_t[2] = esalt_buf1[ 6]; @@ -2208,22 +2111,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2262,17 +2165,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo w0_t[1] = esalt_buf1[ 9]; w0_t[2] = esalt_buf1[10]; w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; w1_t[1] = esalt_buf1[13]; w1_t[2] = esalt_buf1[14]; w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; w2_t[1] = esalt_buf2[ 1]; w2_t[2] = esalt_buf2[ 2]; w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; w3_t[1] = esalt_buf2[ 5]; w3_t[2] = digest_esalt_len * 8; @@ -2312,22 +2212,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H2, 
d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -2352,19 +2252,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s08 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global 
u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s16 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { }
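
Editor's notes (appended; the notes and C sketches below are illustrations for the reader, not part of the patch).

1. bin2asc table. Both live kernels (m11400_m04, m11400_s04) replace the old lid * 4 population of l_bin2asc with a stride loop, for (u32 i = lid; i < 256; i += lsz), which fills the 256-entry table correctly for any work-group size now that the reqd_work_group_size (64, 1, 1) attribute is gone. Each entry packs a byte's two lowercase hex digits so that a little-endian store writes them in reading order. A minimal host-side C sketch of the same construction:

    #include <stdio.h>
    #include <stdint.h>

    /* Sketch of the l_bin2asc construction: one u32 per byte value,
       holding the byte's two lowercase hex digits. */
    static uint32_t bin2asc[256];

    static void build_table (void)
    {
      for (uint32_t i = 0; i < 256; i++)
      {
        const uint32_t i0 = (i >> 0) & 15; // low nibble
        const uint32_t i1 = (i >> 4) & 15; // high nibble

        bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8
                   | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
      }
    }

    int main (void)
    {
      build_table ();

      const uint32_t v = bin2asc[0xab];

      /* byte 0 holds the high nibble's digit, byte 1 the low nibble's,
         so a little-endian store emits them in reading order: "ab" */
      printf ("%c%c\n", (int) (v & 0xff), (int) ((v >> 8) & 0xff));

      return 0;
    }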
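2. Vendor paths in memcat32. The rewritten memcat32 shifts the appended candidate to an arbitrary byte offset with amd_bytealign on AMD/generic and with __byte_perm plus the selector (0x76543210 >> (offset_minus_4 * 4)) & 0xffff on NVIDIA. As I understand the two intrinsics, both select a 4-byte window from the concatenation of two adjacent 32-bit words, so the paths should agree; a rough host-side emulation to sanity-check that (the & 7 in byte_perm_emu is a simplification of the full selector semantics):

    #include <stdio.h>
    #include <stdint.h>

    /* Emulation of amd_bytealign: from the 8 bytes src0:src1, return the
       4 bytes starting src2 bytes up from the bottom (src2 in 0..3). */
    static uint32_t amd_bytealign_emu (uint32_t src0, uint32_t src1, uint32_t src2)
    {
      const uint64_t cat = ((uint64_t) src0 << 32) | src1;

      return (uint32_t) (cat >> (src2 * 8));
    }

    /* Emulation of __byte_perm: each selector nibble picks one byte out
       of the 8-byte value b:a (simplified to the low 3 bits). */
    static uint32_t byte_perm_emu (uint32_t a, uint32_t b, uint32_t s)
    {
      const uint64_t cat = ((uint64_t) b << 32) | a;

      uint32_t r = 0;

      for (int i = 0; i < 4; i++)
      {
        const uint32_t sel = (s >> (i * 4)) & 7;

        r |= ((uint32_t) (cat >> (sel * 8)) & 0xff) << (i * 8);
      }

      return r;
    }

    int main (void)
    {
      const uint32_t prev = 0x44332211;
      const uint32_t next = 0x88776655;

      for (uint32_t mod = 1; mod < 4; mod++)
      {
        const uint32_t offset_minus_4 = 4 - mod;

        const uint32_t selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;

        /* argument order as in memcat32: (next word, previous word) for
           amd_bytealign, (previous word, next word) for __byte_perm */
        printf ("mod=%u amd=%08x nv=%08x\n", (unsigned) mod,
                (unsigned) amd_bytealign_emu (next, prev, offset_minus_4),
                (unsigned) byte_perm_emu (prev, next, selector));
      }

      return 0;
    }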
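3. MD5 round functions. All MD5_H1/MD5_H2 step pairs collapse back to plain MD5_H, and the now-dead u32 tmp2 temporary is deleted with them; the H1/H2 split appears to have been a two-temporary variant that cached a shared XOR term between adjacent round-3 steps. For reference, the four auxiliary functions from RFC 1321 that the MD5_STEP macros apply (round 3, H, is the pure XOR, so the collapse does not change the digest):

    /* RFC 1321 auxiliary functions (reference, not hashcat's exact macros) */
    #define MD5_F(x,y,z) (((x) & (y)) | (~(x) & (z)))
    #define MD5_G(x,y,z) (((x) & (z)) | ((y) & ~(z)))
    #define MD5_H(x,y,z) ((x) ^ (y) ^ (z))
    #define MD5_I(x,y,z) ((y) ^ ((x) | ~(z)))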
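4. SIMD loop and compare. The inner loop now steps il_pos by VECT_SIZE, builds candidates in u32x vectors via apply_rules_vect and append_0x80_2x4_VV, and replaces the scalar #include COMPARE_M / COMPARE_S epilogue with COMPARE_M_SIMD (a, d, c, b) and COMPARE_S_SIMD (a, d, c, b); the argument order keeps the old DGST_R0..R3 = 0, 3, 2, 1 mapping (r0 = a, r1 = d, r2 = c, r3 = b). In spirit, each vector lane carries one rule candidate and the compare checks lanes independently. A paraphrase in plain C (compare_s_simd and the fixed VECT_SIZE are stand-ins, not the actual inc_simd.cl macros, which in the multi-hash case also run the bitmap pre-filter):

    #include <stdio.h>
    #include <stdint.h>

    #define VECT_SIZE 4 /* assumed vector width for the sketch */

    /* Lane-wise single-hash compare: each lane is an independent rule
       candidate; the returned lane index recovers which rule hit. */
    static int compare_s_simd (const uint32_t r0[VECT_SIZE],
                               const uint32_t r1[VECT_SIZE],
                               const uint32_t r2[VECT_SIZE],
                               const uint32_t r3[VECT_SIZE],
                               const uint32_t search[4])
    {
      for (int lane = 0; lane < VECT_SIZE; lane++)
      {
        if (r0[lane] == search[0]
         && r1[lane] == search[1]
         && r2[lane] == search[2]
         && r3[lane] == search[3]) return lane;
      }

      return -1;
    }

    int main (void)
    {
      const uint32_t r0[VECT_SIZE] = { 1, 2, 3, 4 };
      const uint32_t r1[VECT_SIZE] = { 0, 0, 0, 0 };
      const uint32_t r2[VECT_SIZE] = { 0, 0, 0, 0 };
      const uint32_t r3[VECT_SIZE] = { 0, 0, 0, 0 };

      const uint32_t search[4] = { 3, 0, 0, 0 };

      printf ("hit in lane %d\n", compare_s_simd (r0, r1, r2, r3, search));

      return 0;
    }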