2 * Authors.....: Jens Steube <jens.steube@gmail.com>
3 * Gabriele Gristina <matrix@hashcat.net>
10 //too much register pressure
11 //#define NEW_SIMD_CODE
13 #include "inc_vendor.cl"
14 #include "inc_hash_constants.h"
15 #include "inc_hash_functions.cl"
16 #include "inc_types.cl"
17 #include "inc_common.cl"
18 #include "inc_simd.cl"
20 #define INITVAL 0x0101010101010101
23 #define BOX(S,n,i) (S)[(n)][(i)]
25 #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
27 #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
29 #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
31 #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
34 #define SBOG_LPSti64 \
35 BOX (s_sbob_sl64, 0, ((t[0] >> (i * 8)) & 0xff)) ^ \
36 BOX (s_sbob_sl64, 1, ((t[1] >> (i * 8)) & 0xff)) ^ \
37 BOX (s_sbob_sl64, 2, ((t[2] >> (i * 8)) & 0xff)) ^ \
38 BOX (s_sbob_sl64, 3, ((t[3] >> (i * 8)) & 0xff)) ^ \
39 BOX (s_sbob_sl64, 4, ((t[4] >> (i * 8)) & 0xff)) ^ \
40 BOX (s_sbob_sl64, 5, ((t[5] >> (i * 8)) & 0xff)) ^ \
41 BOX (s_sbob_sl64, 6, ((t[6] >> (i * 8)) & 0xff)) ^ \
42 BOX (s_sbob_sl64, 7, ((t[7] >> (i * 8)) & 0xff))
46 __constant u64 sbob_sl64[8][256] =
2114 __constant u64 sbob_rc64[12][8] =
2238 void streebog_g (u64x h[8], const u64x m[8], __local u64 (*s_sbob_sl64)[256])
2247 for (int i = 0; i < 8; i++)
2252 for (int i = 0; i < 8; i++)
2254 k[i] = SBOG_LPSti64;
2260 for (int i = 0; i < 8; i++)
2265 for (int r = 0; r < 12; r++)
2270 for (int i = 0; i < 8; i++)
2278 for (int i = 0; i < 8; i++)
2280 s[i] = SBOG_LPSti64;
2283 for (int i = 0; i < 8; i++)
2285 t[i] = k[i] ^ sbob_rc64[r][i];
2291 for (int i = 0; i < 8; i++)
2293 k[i] = SBOG_LPSti64;
2300 for (int i = 0; i < 8; i++)
2302 h[i] ^= s[i] ^ k[i] ^ m[i];
2306 __kernel void m11700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
2312 const u32 gid = get_global_id (0);
2313 const u32 lid = get_local_id (0);
2314 const u32 lsz = get_local_size (0);
2317 * shared lookup table
2320 __local u64 s_sbob_sl64[8][256];
2322 for (u32 i = lid; i < 256; i += lsz)
2324 s_sbob_sl64[0][i] = sbob_sl64[0][i];
2325 s_sbob_sl64[1][i] = sbob_sl64[1][i];
2326 s_sbob_sl64[2][i] = sbob_sl64[2][i];
2327 s_sbob_sl64[3][i] = sbob_sl64[3][i];
2328 s_sbob_sl64[4][i] = sbob_sl64[4][i];
2329 s_sbob_sl64[5][i] = sbob_sl64[5][i];
2330 s_sbob_sl64[6][i] = sbob_sl64[6][i];
2331 s_sbob_sl64[7][i] = sbob_sl64[7][i];
2334 barrier (CLK_LOCAL_MEM_FENCE);
2336 if (gid >= gid_max) return;
2345 pw_buf0[0] = pws[gid].i[0];
2346 pw_buf0[1] = pws[gid].i[1];
2347 pw_buf0[2] = pws[gid].i[2];
2348 pw_buf0[3] = pws[gid].i[3];
2349 pw_buf1[0] = pws[gid].i[4];
2350 pw_buf1[1] = pws[gid].i[5];
2351 pw_buf1[2] = pws[gid].i[6];
2352 pw_buf1[3] = pws[gid].i[7];
2354 const u32 pw_l_len = pws[gid].pw_len;
2360 for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
2362 const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
2364 const u32x pw_len = pw_l_len + pw_r_len;
2367 * concat password candidate
2370 u32x wordl0[4] = { 0 };
2371 u32x wordl1[4] = { 0 };
2372 u32x wordl2[4] = { 0 };
2373 u32x wordl3[4] = { 0 };
2375 wordl0[0] = pw_buf0[0];
2376 wordl0[1] = pw_buf0[1];
2377 wordl0[2] = pw_buf0[2];
2378 wordl0[3] = pw_buf0[3];
2379 wordl1[0] = pw_buf1[0];
2380 wordl1[1] = pw_buf1[1];
2381 wordl1[2] = pw_buf1[2];
2382 wordl1[3] = pw_buf1[3];
2384 u32x wordr0[4] = { 0 };
2385 u32x wordr1[4] = { 0 };
2386 u32x wordr2[4] = { 0 };
2387 u32x wordr3[4] = { 0 };
2389 wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
2390 wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
2391 wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
2392 wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
2393 wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
2394 wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
2395 wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
2396 wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
2398 if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
2400 switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
2404 switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
2412 w0[0] = wordl0[0] | wordr0[0];
2413 w0[1] = wordl0[1] | wordr0[1];
2414 w0[2] = wordl0[2] | wordr0[2];
2415 w0[3] = wordl0[3] | wordr0[3];
2416 w1[0] = wordl1[0] | wordr1[0];
2417 w1[1] = wordl1[1] | wordr1[1];
2418 w1[2] = wordl1[2] | wordr1[2];
2419 w1[3] = wordl1[3] | wordr1[3];
2420 w2[0] = wordl2[0] | wordr2[0];
2421 w2[1] = wordl2[1] | wordr2[1];
2422 w2[2] = wordl2[2] | wordr2[2];
2423 w2[3] = wordl2[3] | wordr2[3];
2424 w3[0] = wordl3[0] | wordr3[0];
2425 w3[1] = wordl3[1] | wordr3[1];
2426 w3[2] = wordl3[2] | wordr3[2];
2427 w3[3] = wordl3[3] | wordr3[3];
2453 * reverse message block
2458 m[0] = hl32_to_64 (w[15], w[14]);
2459 m[1] = hl32_to_64 (w[13], w[12]);
2460 m[2] = hl32_to_64 (w[11], w[10]);
2461 m[3] = hl32_to_64 (w[ 9], w[ 8]);
2462 m[4] = hl32_to_64 (w[ 7], w[ 6]);
2463 m[5] = hl32_to_64 (w[ 5], w[ 4]);
2464 m[6] = hl32_to_64 (w[ 3], w[ 2]);
2465 m[7] = hl32_to_64 (w[ 1], w[ 0]);
2467 m[0] = swap64 (m[0]);
2468 m[1] = swap64 (m[1]);
2469 m[2] = swap64 (m[2]);
2470 m[3] = swap64 (m[3]);
2471 m[4] = swap64 (m[4]);
2472 m[5] = swap64 (m[5]);
2473 m[6] = swap64 (m[6]);
2474 m[7] = swap64 (m[7]);
2476 // state buffer (hash)
2489 streebog_g (h, m, s_sbob_sl64);
2500 z[7] = swap64 ((u64) (pw_len * 8));
2502 streebog_g (h, z, s_sbob_sl64);
2503 streebog_g (h, m, s_sbob_sl64);
2505 const u32x r0 = l32_from_64 (h[0]);
2506 const u32x r1 = h32_from_64 (h[0]);
2507 const u32x r2 = l32_from_64 (h[1]);
2508 const u32x r3 = h32_from_64 (h[1]);
2510 COMPARE_M_SIMD (r0, r1, r2, r3);
2514 __kernel void m11700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
2518 __kernel void m11700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
2522 __kernel void m11700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
2528 const u32 gid = get_global_id (0);
2529 const u32 lid = get_local_id (0);
2530 const u32 lsz = get_local_size (0);
2533 * shared lookup table
2536 __local u64 s_sbob_sl64[8][256];
2538 for (u32 i = lid; i < 256; i += lsz)
2540 s_sbob_sl64[0][i] = sbob_sl64[0][i];
2541 s_sbob_sl64[1][i] = sbob_sl64[1][i];
2542 s_sbob_sl64[2][i] = sbob_sl64[2][i];
2543 s_sbob_sl64[3][i] = sbob_sl64[3][i];
2544 s_sbob_sl64[4][i] = sbob_sl64[4][i];
2545 s_sbob_sl64[5][i] = sbob_sl64[5][i];
2546 s_sbob_sl64[6][i] = sbob_sl64[6][i];
2547 s_sbob_sl64[7][i] = sbob_sl64[7][i];
2550 barrier (CLK_LOCAL_MEM_FENCE);
2552 if (gid >= gid_max) return;
2561 pw_buf0[0] = pws[gid].i[0];
2562 pw_buf0[1] = pws[gid].i[1];
2563 pw_buf0[2] = pws[gid].i[2];
2564 pw_buf0[3] = pws[gid].i[3];
2565 pw_buf1[0] = pws[gid].i[4];
2566 pw_buf1[1] = pws[gid].i[5];
2567 pw_buf1[2] = pws[gid].i[6];
2568 pw_buf1[3] = pws[gid].i[7];
2570 const u32 pw_l_len = pws[gid].pw_len;
2576 const u32 search[4] =
2578 digests_buf[digests_offset].digest_buf[DGST_R0],
2579 digests_buf[digests_offset].digest_buf[DGST_R1],
2580 digests_buf[digests_offset].digest_buf[DGST_R2],
2581 digests_buf[digests_offset].digest_buf[DGST_R3]
2588 for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
2590 const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
2592 const u32x pw_len = pw_l_len + pw_r_len;
2595 * concat password candidate
2598 u32x wordl0[4] = { 0 };
2599 u32x wordl1[4] = { 0 };
2600 u32x wordl2[4] = { 0 };
2601 u32x wordl3[4] = { 0 };
2603 wordl0[0] = pw_buf0[0];
2604 wordl0[1] = pw_buf0[1];
2605 wordl0[2] = pw_buf0[2];
2606 wordl0[3] = pw_buf0[3];
2607 wordl1[0] = pw_buf1[0];
2608 wordl1[1] = pw_buf1[1];
2609 wordl1[2] = pw_buf1[2];
2610 wordl1[3] = pw_buf1[3];
2612 u32x wordr0[4] = { 0 };
2613 u32x wordr1[4] = { 0 };
2614 u32x wordr2[4] = { 0 };
2615 u32x wordr3[4] = { 0 };
2617 wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
2618 wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
2619 wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
2620 wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
2621 wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
2622 wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
2623 wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
2624 wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
2626 if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
2628 switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
2632 switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
2640 w0[0] = wordl0[0] | wordr0[0];
2641 w0[1] = wordl0[1] | wordr0[1];
2642 w0[2] = wordl0[2] | wordr0[2];
2643 w0[3] = wordl0[3] | wordr0[3];
2644 w1[0] = wordl1[0] | wordr1[0];
2645 w1[1] = wordl1[1] | wordr1[1];
2646 w1[2] = wordl1[2] | wordr1[2];
2647 w1[3] = wordl1[3] | wordr1[3];
2648 w2[0] = wordl2[0] | wordr2[0];
2649 w2[1] = wordl2[1] | wordr2[1];
2650 w2[2] = wordl2[2] | wordr2[2];
2651 w2[3] = wordl2[3] | wordr2[3];
2652 w3[0] = wordl3[0] | wordr3[0];
2653 w3[1] = wordl3[1] | wordr3[1];
2654 w3[2] = wordl3[2] | wordr3[2];
2655 w3[3] = wordl3[3] | wordr3[3];
2681 * reverse message block
2686 m[0] = hl32_to_64 (w[15], w[14]);
2687 m[1] = hl32_to_64 (w[13], w[12]);
2688 m[2] = hl32_to_64 (w[11], w[10]);
2689 m[3] = hl32_to_64 (w[ 9], w[ 8]);
2690 m[4] = hl32_to_64 (w[ 7], w[ 6]);
2691 m[5] = hl32_to_64 (w[ 5], w[ 4]);
2692 m[6] = hl32_to_64 (w[ 3], w[ 2]);
2693 m[7] = hl32_to_64 (w[ 1], w[ 0]);
2695 m[0] = swap64 (m[0]);
2696 m[1] = swap64 (m[1]);
2697 m[2] = swap64 (m[2]);
2698 m[3] = swap64 (m[3]);
2699 m[4] = swap64 (m[4]);
2700 m[5] = swap64 (m[5]);
2701 m[6] = swap64 (m[6]);
2702 m[7] = swap64 (m[7]);
2704 // state buffer (hash)
2717 streebog_g (h, m, s_sbob_sl64);
2728 z[7] = swap64 ((u64) (pw_len * 8));
2730 streebog_g (h, z, s_sbob_sl64);
2731 streebog_g (h, m, s_sbob_sl64);
2733 const u32x r0 = l32_from_64 (h[0]);
2734 const u32x r1 = h32_from_64 (h[0]);
2735 const u32x r2 = l32_from_64 (h[1]);
2736 const u32x r3 = h32_from_64 (h[1]);
2738 COMPARE_S_SIMD (r0, r1, r2, r3);
2742 __kernel void m11700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
2746 __kernel void m11700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)