X-Git-Url: https://www.flypig.org.uk/git/?a=blobdiff_plain;f=OpenCL%2Fm08900.cl;h=a79c5fcaa372e10dcfd82e131f33f8a5f45ecfa8;hb=0e68b2af2510adbb1faffe8ee84eee578462f29d;hp=e2e6cda3e67ba370e971f0aba368c597300054bd;hpb=2283d5c843c425aebfdad4a521b1e0ad85fae387;p=hashcat.git diff --git a/OpenCL/m08900.cl b/OpenCL/m08900.cl index e2e6cda..a79c5fc 100644 --- a/OpenCL/m08900.cl +++ b/OpenCL/m08900.cl @@ -5,20 +5,20 @@ #define _SCRYPT_ -#include "include/constants.h" -#include "include/kernel_vendor.h" +#include "inc_hash_constants.h" +#include "inc_vendor.cl" #define DGST_R0 0 #define DGST_R1 1 #define DGST_R2 2 #define DGST_R3 3 -#include "include/kernel_functions.c" -#include "types_ocl.c" -#include "common.c" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" -#define COMPARE_S "check_single_comp4.c" -#define COMPARE_M "check_multi_comp4.c" +#define COMPARE_S "inc_comp_single.cl" +#define COMPARE_M "inc_comp_multi.cl" __constant u32 k_sha256[64] = { @@ -40,7 +40,7 @@ __constant u32 k_sha256[64] = SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, }; -static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8]) +void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8]) { u32 a = digest[0]; u32 b = digest[1]; @@ -110,7 +110,9 @@ static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], ROUND_STEP (0); + #ifdef _unroll #pragma unroll + #endif for (int i = 16; i < 64; i += 16) { ROUND_EXPAND (); ROUND_STEP (i); @@ -126,7 +128,7 @@ static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], digest[7] += h; } -static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8]) +void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -185,7 +187,7 @@ static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipa sha256_transform (w0, w1, w2, w3, opad); } -static void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8]) +void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -227,7 +229,7 @@ static void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipa sha256_transform (w0, w1, w2, w3, digest); } -static void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) { switch (block_len) { @@ -560,7 +562,7 @@ static void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], } } -static uint4 swap32_4 (uint4 v) +uint4 swap32_4 (uint4 v) { return (rotate ((v & 0x00FF00FF), 24u) | rotate ((v & 0xFF00FF00), 8u)); } @@ -620,79 +622,59 @@ static uint4 swap32_4 (uint4 v) R3 = R3 + X3; \ } -static void salsa_r (uint4 *T) +void salsa_r (uint4 *TI) { - uint4 R0 = T[STATE_CNT4 - 4]; - uint4 R1 = T[STATE_CNT4 - 3]; - uint4 R2 = T[STATE_CNT4 - 2]; - uint4 R3 = T[STATE_CNT4 - 1]; + uint4 R0 = TI[STATE_CNT4 - 4]; + uint4 R1 = TI[STATE_CNT4 - 3]; + uint4 R2 = TI[STATE_CNT4 - 2]; + uint4 R3 = TI[STATE_CNT4 - 1]; + + uint4 TO[STATE_CNT4]; + + int idx_y = 0; + int idx_r1 = 0; + int idx_r2 = SCRYPT_R * 4; - for (u32 i = 0; i < STATE_CNT4; i += 8) + for (int i = 0; i < 
SCRYPT_R; i++) { uint4 Y0; uint4 Y1; uint4 Y2; uint4 Y3; - Y0 = T[i + 0]; - Y1 = T[i + 1]; - Y2 = T[i + 2]; - Y3 = T[i + 3]; + Y0 = TI[idx_y++]; + Y1 = TI[idx_y++]; + Y2 = TI[idx_y++]; + Y3 = TI[idx_y++]; SALSA20_8_XOR (); - T[i + 0] = R0; - T[i + 1] = R1; - T[i + 2] = R2; - T[i + 3] = R3; + TO[idx_r1++] = R0; + TO[idx_r1++] = R1; + TO[idx_r1++] = R2; + TO[idx_r1++] = R3; - Y0 = T[i + 4]; - Y1 = T[i + 5]; - Y2 = T[i + 6]; - Y3 = T[i + 7]; + Y0 = TI[idx_y++]; + Y1 = TI[idx_y++]; + Y2 = TI[idx_y++]; + Y3 = TI[idx_y++]; SALSA20_8_XOR (); - T[i + 4] = R0; - T[i + 5] = R1; - T[i + 6] = R2; - T[i + 7] = R3; - } - - #define exchg(x,y) { const uint4 t = T[(x)]; T[(x)] = T[(y)]; T[(y)] = t; } - - #define exchg4(x,y) \ - { \ - const u32 x4 = (x) * 4; \ - const u32 y4 = (y) * 4; \ - \ - exchg (x4 + 0, y4 + 0); \ - exchg (x4 + 1, y4 + 1); \ - exchg (x4 + 2, y4 + 2); \ - exchg (x4 + 3, y4 + 3); \ - } - - for (u32 i = 1; i < SCRYPT_R / 1; i++) - { - const u32 x = i * 1; - const u32 y = i * 2; - - exchg4 (x, y); + TO[idx_r2++] = R0; + TO[idx_r2++] = R1; + TO[idx_r2++] = R2; + TO[idx_r2++] = R3; } - for (u32 i = 1; i < SCRYPT_R / 2; i++) + #pragma unroll + for (int i = 0; i < STATE_CNT4; i++) { - const u32 x = i * 1; - const u32 y = i * 2; - - const u32 xr1 = (SCRYPT_R * 2) - 1 - x; - const u32 yr1 = (SCRYPT_R * 2) - 1 - y; - - exchg4 (xr1, yr1); + TI[i] = TO[i]; } } -static void scrypt_smix (uint4 *X, uint4 *T, const u32 phy, __global uint4 *V) +void scrypt_smix (uint4 *X, uint4 *T, const u32 phy, __global uint4 *V) { #define Coord(x,y,z) (((x) * zSIZE) + ((y) * zSIZE * xSIZE) + (z)) #define CO Coord(x,y,z) @@ -701,11 +683,15 @@ static void scrypt_smix (uint4 *X, uint4 *T, const u32 phy, __global uint4 *V) const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO; const u32 zSIZE = STATE_CNT4; - const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + const u32 rid = get_group_id (0); - const u32 x = gid % xSIZE; + const u32 x = (rid * lsz) + lid; + #ifdef _unroll #pragma unroll + #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); @@ -743,7 +729,9 @@ static void scrypt_smix (uint4 *X, uint4 *T, const u32 phy, __global uint4 *V) salsa_r (X); } + #ifdef _unroll #pragma unroll + #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); @@ -758,7 +746,7 @@ static void scrypt_smix (uint4 *X, uint4 *T, const u32 phy, __global uint4 *V) } } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_init (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t 
*combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -899,7 +887,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_init (__gl } } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_loop (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08900_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { const u32 gid = get_global_id (0); @@ -910,12 +898,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_loop (__gl uint4 X[STATE_CNT4]; uint4 T[STATE_CNT4]; + #ifdef _unroll #pragma unroll + #endif for (int z = 0; z < STATE_CNT4; z++) X[z] = swap32_4 (tmps[gid].P[z]); scrypt_smix (X, T, scrypt_phy, d_scryptV_buf); + #ifdef _unroll #pragma unroll + #endif for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = swap32_4 (X[z]); #if SCRYPT_P >= 1 @@ -930,7 +922,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_loop (__gl #endif } -__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_comp (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 
*bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08900_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -969,15 +961,6 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_comp (__gl w3[2] = pws[gid].i[14]; w3[3] = pws[gid].i[15]; - /** - * memory buffers - */ - - const u32 scrypt_r = SCRYPT_R; - const u32 scrypt_p = SCRYPT_P; - //const u32 scrypt_N = SCRYPT_N; - - /** * 2nd pbkdf2, creates B */
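
Editor's note: the largest functional change in the hunks above is the rewrite of salsa_r(), which drops the exchg()/exchg4() post-shuffle and instead writes each Salsa20/8 result directly to its final position in a temporary TO[] array, using two write cursors: idx_r1 starting at 0 (first half) and idx_r2 starting at SCRYPT_R * 4 (second half, since each 64-byte block occupies 4 uint4 vectors). Both the old and the new code arrive at the scrypt BlockMix output order, in which even-numbered result blocks fill the first half of the output and odd-numbered blocks fill the second half; the new version simply writes the results straight into that order instead of swapping them into place afterwards. The following minimal, host-side C sketch illustrates that ordering. Block, R, block_mix and the salsa20_8_xor stub are illustrative names for this note only, not hashcat identifiers, and the Salsa20/8 core itself is stubbed out.

/*
 * Minimal C sketch of the scrypt BlockMix output ordering implemented by
 * the rewritten salsa_r() indexing.  All names here are illustrative.
 */

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define R 2                               /* scrypt r (SCRYPT_R in the kernel) */

typedef struct { uint32_t w[16]; } Block; /* one 64-byte Salsa20/8 block */

/* Stub for X = Salsa20/8 (X ^ Y): only the XOR is shown here. */
static void salsa20_8_xor (Block *x, const Block *y)
{
  for (int i = 0; i < 16; i++) x->w[i] ^= y->w[i];

  /* ... the 8 Salsa20 rounds would follow here ... */
}

/*
 * scrypt BlockMix: the 2*r input blocks are processed in order, but the
 * results are interleaved -- even-numbered results fill the first half of
 * the output, odd-numbered results the second half.  This is what the two
 * write cursors idx_r1 (starts at 0) and idx_r2 (starts at SCRYPT_R * 4)
 * do in the rewritten salsa_r().
 */
static void block_mix (const Block in[2 * R], Block out[2 * R])
{
  Block x = in[2 * R - 1];   /* running state starts from the last input block */

  for (int i = 0; i < 2 * R; i++)
  {
    salsa20_8_xor (&x, &in[i]);

    out[(i % 2) * R + (i / 2)] = x;   /* even i -> slot i/2, odd i -> R + i/2 */
  }
}

int main (void)
{
  Block in[2 * R], out[2 * R];

  memset (in, 0, sizeof (in));

  block_mix (in, out);

  /* Print the permutation so the interleaving is visible. */
  for (int i = 0; i < 2 * R; i++)
    printf ("result %d -> out[%d]\n", i, (i % 2) * R + (i / 2));

  return 0;
}

Writing into a separate TO[] buffer and copying it back replaces the in-place swaps with an explicit permutation; presumably this is easier for the OpenCL compiler to keep in registers, and it removes the special-case exchange loops that did nothing for small SCRYPT_R.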