2 * Author......: Jens Steube <jens.steube@gmail.com>
8 #include "include/constants.h"
9 #include "include/kernel_vendor.h"
16 #include "include/kernel_functions.c"
17 #include "types_ocl.c"
19 #include "include/rp_gpu.h"
// File names of the single-target (COMPARE_S) and multi-target (COMPARE_M)
// comparison snippets.
// NOTE(review): presumably pulled in with #include at the end of each kernel's
// candidate loop - confirm against the use site.
22 #define COMPARE_S "check_single_comp4.c"
23 #define COMPARE_M "check_multi_comp4.c"
// SBOG_LPSti64: expands to the XOR of eight table lookups. For each row
// j = 0..7 the row s_sbob_sl64[j] is indexed with byte number i of t[j]
// (bits i*8 .. i*8+7, selected by the shift and the 0xff mask).
// This is an object-like macro, not a function: it captures the names `t`,
// `i` and `s_sbob_sl64` from whatever scope it is expanded in, so all three
// must be visible at every use site.
27 #define SBOG_LPSti64 \
28 s_sbob_sl64[0][(t[0] >> (i * 8)) & 0xff] ^ \
29 s_sbob_sl64[1][(t[1] >> (i * 8)) & 0xff] ^ \
30 s_sbob_sl64[2][(t[2] >> (i * 8)) & 0xff] ^ \
31 s_sbob_sl64[3][(t[3] >> (i * 8)) & 0xff] ^ \
32 s_sbob_sl64[4][(t[4] >> (i * 8)) & 0xff] ^ \
33 s_sbob_sl64[5][(t[5] >> (i * 8)) & 0xff] ^ \
34 s_sbob_sl64[6][(t[6] >> (i * 8)) & 0xff] ^ \
35 s_sbob_sl64[7][(t[7] >> (i * 8)) & 0xff]
// Lookup table of 8 rows x 256 64-bit entries in the __constant address space.
// The kernels in this file copy it entry by entry into a __local array
// (s_sbob_sl64) that SBOG_LPSti64 then reads.
39 __constant u64 sbob_sl64[8][256] =
// Round constants: 12 rows of 8 64-bit words in the __constant address space.
// streebog_g reads row r in round r and XORs word i into k[i].
2107 __constant u64 sbob_rc64[12][8] =
// streebog_g: updates the 8-word state h in place from the 8-word block m.
//   h            - state, read and overwritten (h[i] ^= ... at the end)
//   m            - input block, read-only
//   s_sbob_sl64  - __local copy of the 8 x 256 lookup table, read through the
//                  SBOG_LPSti64 macro
// The function runs 12 rounds, using one row of sbob_rc64 per round, and then
// folds the result back into h.
// NOTE(review): going by the name, this is presumably the g compression
// function of Streebog (GOST R 34.11-2012) - confirm against the spec.
2231 static void streebog_g (u64 h[8], const u64 m[8], __local u64 s_sbob_sl64[8][256])
2238 for (int i = 0; i < 8; i++)
2243 for (int i = 0; i < 8; i++)
// SBOG_LPSti64 reads byte i of every t[] word, so k[i] depends on all of t.
2245 k[i] = SBOG_LPSti64;
2249 for (int i = 0; i < 8; i++)
// 12 rounds; r selects the row of round constants used below.
2254 for (int r = 0; r < 12; r++)
2257 for (int i = 0; i < 8; i++)
2263 for (int i = 0; i < 8; i++)
// s is rebuilt from the current t through the lookup table.
2265 s[i] = SBOG_LPSti64;
2268 for (int i = 0; i < 8; i++)
// t = k XOR this round's constants; the loop that follows turns t into the
// next k through the lookup table.
2270 t[i] = k[i] ^ sbob_rc64[r][i];
2274 for (int i = 0; i < 8; i++)
2276 k[i] = SBOG_LPSti64;
2281 for (int i = 0; i < 8; i++)
// Final fold: the state is XORed with s, k and the input block.
2283 h[i] ^= s[i] ^ k[i] ^ m[i];
// Kernel m11800_m04. Each work-item loads one password from pws[gid], applies
// every one of the rules_cnt rules in rules_buf to it, and hashes each
// resulting candidate with three streebog_g calls. Before that, the whole
// work-group (fixed at 64 work-items by reqd_work_group_size) cooperatively
// copies the lookup table from __constant to __local memory.
2287 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m04 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
2293 const u32 lid = get_local_id (0);
2296 * shared lookup table
// Each work-item copies 4 consecutive entries (lid4 .. lid4 + 3) of every row.
// With 64 work-items per group, lid4 covers 0 .. 252, so the group fills all
// 8 x 256 entries between them.
2299 const u32 lid4 = lid * 4;
2301 __local u64 s_sbob_sl64[8][256];
2303 s_sbob_sl64[0][lid4 + 0] = sbob_sl64[0][lid4 + 0];
2304 s_sbob_sl64[0][lid4 + 1] = sbob_sl64[0][lid4 + 1];
2305 s_sbob_sl64[0][lid4 + 2] = sbob_sl64[0][lid4 + 2];
2306 s_sbob_sl64[0][lid4 + 3] = sbob_sl64[0][lid4 + 3];
2307 s_sbob_sl64[1][lid4 + 0] = sbob_sl64[1][lid4 + 0];
2308 s_sbob_sl64[1][lid4 + 1] = sbob_sl64[1][lid4 + 1];
2309 s_sbob_sl64[1][lid4 + 2] = sbob_sl64[1][lid4 + 2];
2310 s_sbob_sl64[1][lid4 + 3] = sbob_sl64[1][lid4 + 3];
2311 s_sbob_sl64[2][lid4 + 0] = sbob_sl64[2][lid4 + 0];
2312 s_sbob_sl64[2][lid4 + 1] = sbob_sl64[2][lid4 + 1];
2313 s_sbob_sl64[2][lid4 + 2] = sbob_sl64[2][lid4 + 2];
2314 s_sbob_sl64[2][lid4 + 3] = sbob_sl64[2][lid4 + 3];
2315 s_sbob_sl64[3][lid4 + 0] = sbob_sl64[3][lid4 + 0];
2316 s_sbob_sl64[3][lid4 + 1] = sbob_sl64[3][lid4 + 1];
2317 s_sbob_sl64[3][lid4 + 2] = sbob_sl64[3][lid4 + 2];
2318 s_sbob_sl64[3][lid4 + 3] = sbob_sl64[3][lid4 + 3];
2319 s_sbob_sl64[4][lid4 + 0] = sbob_sl64[4][lid4 + 0];
2320 s_sbob_sl64[4][lid4 + 1] = sbob_sl64[4][lid4 + 1];
2321 s_sbob_sl64[4][lid4 + 2] = sbob_sl64[4][lid4 + 2];
2322 s_sbob_sl64[4][lid4 + 3] = sbob_sl64[4][lid4 + 3];
2323 s_sbob_sl64[5][lid4 + 0] = sbob_sl64[5][lid4 + 0];
2324 s_sbob_sl64[5][lid4 + 1] = sbob_sl64[5][lid4 + 1];
2325 s_sbob_sl64[5][lid4 + 2] = sbob_sl64[5][lid4 + 2];
2326 s_sbob_sl64[5][lid4 + 3] = sbob_sl64[5][lid4 + 3];
2327 s_sbob_sl64[6][lid4 + 0] = sbob_sl64[6][lid4 + 0];
2328 s_sbob_sl64[6][lid4 + 1] = sbob_sl64[6][lid4 + 1];
2329 s_sbob_sl64[6][lid4 + 2] = sbob_sl64[6][lid4 + 2];
2330 s_sbob_sl64[6][lid4 + 3] = sbob_sl64[6][lid4 + 3];
2331 s_sbob_sl64[7][lid4 + 0] = sbob_sl64[7][lid4 + 0];
2332 s_sbob_sl64[7][lid4 + 1] = sbob_sl64[7][lid4 + 1];
2333 s_sbob_sl64[7][lid4 + 2] = sbob_sl64[7][lid4 + 2];
2334 s_sbob_sl64[7][lid4 + 3] = sbob_sl64[7][lid4 + 3];
// Every work-item must have finished its part of the copy before anyone
// reads the table.
2336 barrier (CLK_LOCAL_MEM_FENCE);
2342 const u32 gid = get_global_id (0);
// The bounds check sits after the barrier on purpose: out-of-range
// work-items still take part in the table copy and still reach the barrier.
2344 if (gid >= gid_max) return;
// First eight 32-bit words of this work-item's password.
2348 pw_buf0[0] = pws[gid].i[ 0];
2349 pw_buf0[1] = pws[gid].i[ 1];
2350 pw_buf0[2] = pws[gid].i[ 2];
2351 pw_buf0[3] = pws[gid].i[ 3];
2355 pw_buf1[0] = pws[gid].i[ 4];
2356 pw_buf1[1] = pws[gid].i[ 5];
2357 pw_buf1[2] = pws[gid].i[ 6];
2358 pw_buf1[3] = pws[gid].i[ 7];
2360 const u32 pw_len = pws[gid].pw_len;
// One candidate per rule.
2366 for (u32 il_pos = 0; il_pos < rules_cnt; il_pos++)
// apply_rules returns the length of the mangled candidate.
2387 const u32 out_len = apply_rules (rules_buf[il_pos].cmds, &w[0], &w[1], pw_len);
// NOTE(review): by its name this appends a 0x01 padding byte at offset
// out_len - confirm against the helper's definition.
2389 append_0x01_2x4 (&w[0], &w[1], out_len);
2392 * reverse message block
// m[] takes the w[] word pairs in reverse order (w[15], w[14] first), and
// every 64-bit word is then passed through swap64.
2397 m[0] = hl32_to_64 (w[15], w[14]);
2398 m[1] = hl32_to_64 (w[13], w[12]);
2399 m[2] = hl32_to_64 (w[11], w[10]);
2400 m[3] = hl32_to_64 (w[ 9], w[ 8]);
2401 m[4] = hl32_to_64 (w[ 7], w[ 6]);
2402 m[5] = hl32_to_64 (w[ 5], w[ 4]);
2403 m[6] = hl32_to_64 (w[ 3], w[ 2]);
2404 m[7] = hl32_to_64 (w[ 1], w[ 0]);
2406 m[0] = swap64 (m[0]);
2407 m[1] = swap64 (m[1]);
2408 m[2] = swap64 (m[2]);
2409 m[3] = swap64 (m[3]);
2410 m[4] = swap64 (m[4]);
2411 m[5] = swap64 (m[5]);
2412 m[6] = swap64 (m[6]);
2413 m[7] = swap64 (m[7]);
2415 // state buffer (hash)
// First state update: the message block itself.
2428 streebog_g (h, m, s_sbob_sl64);
// z[7] carries out_len * 8 passed through swap64.
// NOTE(review): presumably the message length in bits - confirm.
2439 z[7] = swap64 ((u64) (out_len * 8));
// Second update with z, third with m again.
// NOTE(review): presumably the length and checksum stages of the hash's
// finalization, where the checksum of a single block equals the block.
2441 streebog_g (h, z, s_sbob_sl64);
2442 streebog_g (h, m, s_sbob_sl64);
// r0..r3 are derived from h[0] and h[1] only.
// NOTE(review): by the helper names, these are their low and high 32-bit
// halves.
2444 const u32 r0 = l32_from_64 (h[0]);
2445 const u32 r1 = h32_from_64 (h[0]);
2446 const u32 r2 = l32_from_64 (h[1]);
2447 const u32 r3 = h32_from_64 (h[1]);
// Kernel entry point m11800_m08. Its parameter list matches m11800_m04 except
// that the count argument after loop_cnt is named combs_cnt instead of
// rules_cnt.
// NOTE(review): presumably an empty stub kept so the kernel name exists -
// confirm against the body.
2453 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m08 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
// Kernel entry point m11800_m16. Its parameter list matches m11800_m04 except
// that the count argument after loop_cnt is named combs_cnt instead of
// rules_cnt.
// NOTE(review): presumably an empty stub kept so the kernel name exists -
// confirm against the body.
2457 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m16 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
// Kernel m11800_s04. It follows the same steps as m11800_m04: cooperative
// copy of the lookup table into __local memory, one password per work-item
// from pws[gid], one candidate per rule, three streebog_g calls per
// candidate. In addition it reads the four digest words of the single target
// at digests_offset into search[].
2461 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s04 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
2467 const u32 lid = get_local_id (0);
2471 * shared lookup table
// Each work-item copies 4 consecutive entries (lid4 .. lid4 + 3) of every row.
// With 64 work-items per group, lid4 covers 0 .. 252, so the group fills all
// 8 x 256 entries between them.
2474 const u32 lid4 = lid * 4;
2476 __local u64 s_sbob_sl64[8][256];
2478 s_sbob_sl64[0][lid4 + 0] = sbob_sl64[0][lid4 + 0];
2479 s_sbob_sl64[0][lid4 + 1] = sbob_sl64[0][lid4 + 1];
2480 s_sbob_sl64[0][lid4 + 2] = sbob_sl64[0][lid4 + 2];
2481 s_sbob_sl64[0][lid4 + 3] = sbob_sl64[0][lid4 + 3];
2482 s_sbob_sl64[1][lid4 + 0] = sbob_sl64[1][lid4 + 0];
2483 s_sbob_sl64[1][lid4 + 1] = sbob_sl64[1][lid4 + 1];
2484 s_sbob_sl64[1][lid4 + 2] = sbob_sl64[1][lid4 + 2];
2485 s_sbob_sl64[1][lid4 + 3] = sbob_sl64[1][lid4 + 3];
2486 s_sbob_sl64[2][lid4 + 0] = sbob_sl64[2][lid4 + 0];
2487 s_sbob_sl64[2][lid4 + 1] = sbob_sl64[2][lid4 + 1];
2488 s_sbob_sl64[2][lid4 + 2] = sbob_sl64[2][lid4 + 2];
2489 s_sbob_sl64[2][lid4 + 3] = sbob_sl64[2][lid4 + 3];
2490 s_sbob_sl64[3][lid4 + 0] = sbob_sl64[3][lid4 + 0];
2491 s_sbob_sl64[3][lid4 + 1] = sbob_sl64[3][lid4 + 1];
2492 s_sbob_sl64[3][lid4 + 2] = sbob_sl64[3][lid4 + 2];
2493 s_sbob_sl64[3][lid4 + 3] = sbob_sl64[3][lid4 + 3];
2494 s_sbob_sl64[4][lid4 + 0] = sbob_sl64[4][lid4 + 0];
2495 s_sbob_sl64[4][lid4 + 1] = sbob_sl64[4][lid4 + 1];
2496 s_sbob_sl64[4][lid4 + 2] = sbob_sl64[4][lid4 + 2];
2497 s_sbob_sl64[4][lid4 + 3] = sbob_sl64[4][lid4 + 3];
2498 s_sbob_sl64[5][lid4 + 0] = sbob_sl64[5][lid4 + 0];
2499 s_sbob_sl64[5][lid4 + 1] = sbob_sl64[5][lid4 + 1];
2500 s_sbob_sl64[5][lid4 + 2] = sbob_sl64[5][lid4 + 2];
2501 s_sbob_sl64[5][lid4 + 3] = sbob_sl64[5][lid4 + 3];
2502 s_sbob_sl64[6][lid4 + 0] = sbob_sl64[6][lid4 + 0];
2503 s_sbob_sl64[6][lid4 + 1] = sbob_sl64[6][lid4 + 1];
2504 s_sbob_sl64[6][lid4 + 2] = sbob_sl64[6][lid4 + 2];
2505 s_sbob_sl64[6][lid4 + 3] = sbob_sl64[6][lid4 + 3];
2506 s_sbob_sl64[7][lid4 + 0] = sbob_sl64[7][lid4 + 0];
2507 s_sbob_sl64[7][lid4 + 1] = sbob_sl64[7][lid4 + 1];
2508 s_sbob_sl64[7][lid4 + 2] = sbob_sl64[7][lid4 + 2];
2509 s_sbob_sl64[7][lid4 + 3] = sbob_sl64[7][lid4 + 3];
// Every work-item must have finished its part of the copy before anyone
// reads the table.
2511 barrier (CLK_LOCAL_MEM_FENCE);
2517 const u32 gid = get_global_id (0);
// The bounds check sits after the barrier on purpose: out-of-range
// work-items still take part in the table copy and still reach the barrier.
2519 if (gid >= gid_max) return;
// First eight 32-bit words of this work-item's password.
2523 pw_buf0[0] = pws[gid].i[ 0];
2524 pw_buf0[1] = pws[gid].i[ 1];
2525 pw_buf0[2] = pws[gid].i[ 2];
2526 pw_buf0[3] = pws[gid].i[ 3];
2530 pw_buf1[0] = pws[gid].i[ 4];
2531 pw_buf1[1] = pws[gid].i[ 5];
2532 pw_buf1[2] = pws[gid].i[ 6];
2533 pw_buf1[3] = pws[gid].i[ 7];
2535 const u32 pw_len = pws[gid].pw_len;
// The four words of the one target digest, picked by DGST_R0..DGST_R3.
// They do not depend on the rule index.
2541 const u32 search[4] =
2543 digests_buf[digests_offset].digest_buf[DGST_R0],
2544 digests_buf[digests_offset].digest_buf[DGST_R1],
2545 digests_buf[digests_offset].digest_buf[DGST_R2],
2546 digests_buf[digests_offset].digest_buf[DGST_R3]
// One candidate per rule.
2553 for (u32 il_pos = 0; il_pos < rules_cnt; il_pos++)
// apply_rules returns the length of the mangled candidate.
2574 const u32 out_len = apply_rules (rules_buf[il_pos].cmds, &w[0], &w[1], pw_len);
// NOTE(review): by its name this appends a 0x01 padding byte at offset
// out_len - confirm against the helper's definition.
2576 append_0x01_2x4 (&w[0], &w[1], out_len);
2579 * reverse message block
// m[] takes the w[] word pairs in reverse order (w[15], w[14] first), and
// every 64-bit word is then passed through swap64.
2584 m[0] = hl32_to_64 (w[15], w[14]);
2585 m[1] = hl32_to_64 (w[13], w[12]);
2586 m[2] = hl32_to_64 (w[11], w[10]);
2587 m[3] = hl32_to_64 (w[ 9], w[ 8]);
2588 m[4] = hl32_to_64 (w[ 7], w[ 6]);
2589 m[5] = hl32_to_64 (w[ 5], w[ 4]);
2590 m[6] = hl32_to_64 (w[ 3], w[ 2]);
2591 m[7] = hl32_to_64 (w[ 1], w[ 0]);
2593 m[0] = swap64 (m[0]);
2594 m[1] = swap64 (m[1]);
2595 m[2] = swap64 (m[2]);
2596 m[3] = swap64 (m[3]);
2597 m[4] = swap64 (m[4]);
2598 m[5] = swap64 (m[5]);
2599 m[6] = swap64 (m[6]);
2600 m[7] = swap64 (m[7]);
2602 // state buffer (hash)
// First state update: the message block itself.
2615 streebog_g (h, m, s_sbob_sl64);
// z[7] carries out_len * 8 passed through swap64.
// NOTE(review): presumably the message length in bits - confirm.
2626 z[7] = swap64 ((u64) (out_len * 8));
// Second update with z, third with m again.
// NOTE(review): presumably the length and checksum stages of the hash's
// finalization, where the checksum of a single block equals the block.
2628 streebog_g (h, z, s_sbob_sl64);
2629 streebog_g (h, m, s_sbob_sl64);
// r0..r3 are derived from h[0] and h[1] only.
// NOTE(review): by the helper names, these are their low and high 32-bit
// halves.
2631 const u32 r0 = l32_from_64 (h[0]);
2632 const u32 r1 = h32_from_64 (h[0]);
2633 const u32 r2 = l32_from_64 (h[1]);
2634 const u32 r3 = h32_from_64 (h[1]);
// Kernel entry point m11800_s08. Its parameter list matches m11800_s04 except
// that the count argument after loop_cnt is named combs_cnt instead of
// rules_cnt.
// NOTE(review): presumably an empty stub kept so the kernel name exists -
// confirm against the body.
2640 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s08 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
// Kernel entry point m11800_s16. Its parameter list matches m11800_s04 except
// that the count argument after loop_cnt is named combs_cnt instead of
// rules_cnt.
// NOTE(review): presumably an empty stub kept so the kernel name exists -
// confirm against the body.
2644 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s16 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)