2 * Author......: Jens Steube <jens.steube@gmail.com>
9 #include "include/constants.h"
10 #include "include/kernel_vendor.h"
29 #include "include/kernel_functions.c"
30 #include "types_amd.c"
31 #include "common_amd.c"
34 #define VECT_COMPARE_S "check_single_vect1_comp4_warp.c"
35 #define VECT_COMPARE_M "check_multi_vect1_comp4_warp.c"
39 #define VECT_COMPARE_S "check_single_vect2_comp4_warp.c"
40 #define VECT_COMPARE_M "check_multi_vect2_comp4_warp.c"
44 #define VECT_COMPARE_S "check_single_vect4_comp4_warp.c"
45 #define VECT_COMPARE_M "check_multi_vect4_comp4_warp.c"
49 #define SIPROUND(v0,v1,v2,v3) \
51 (v1) = rotl64 ((v1), 13); \
53 (v0) = as_ulong (as_uint2 ((v0)).s10); \
55 (v3) = rotl64 ((v3), 16); \
58 (v3) = rotl64 ((v3), 21); \
61 (v1) = rotl64 ((v1), 17); \
63 (v2) = as_ulong (as_uint2 ((v2)).s10);
65 #define SIPROUND(v0,v1,v2,v3) \
67 (v1) = rotl64 ((v1), 13); \
69 (v0) = rotl64 ((v0), 32); \
71 (v3) = rotl64 ((v3), 16); \
74 (v3) = rotl64 ((v3), 21); \
77 (v1) = rotl64 ((v1), 17); \
79 (v2) = rotl64 ((v2), 32);
82 static void m10100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
88 const u32 gid = get_global_id (0);
89 const u32 lid = get_local_id (0);
100 v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
101 v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
102 v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
103 v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
105 u64 *w_ptr = (u64 *) w;
107 w_ptr[pw_len / 8] |= (u64) pw_len << 56;
113 const u32 bf_loops = ceil ((float) bfs_cnt / VECT_DIV);
117 for (u32 il_pos = 0; il_pos < bf_loops; il_pos++)
119 const u32x w0r = words_buf_r[il_pos];
121 const u32x w0 = w0l | w0r;
128 u64x m = hl32_to_64 (w[1], w0);
132 SIPROUND (v0, v1, v2, v3);
133 SIPROUND (v0, v1, v2, v3);
140 for (i = 8, j = 2; i <= pw_len; i += 8, j += 2)
142 m = hl32_to_64 (w[j + 1], w[j + 0]);
146 SIPROUND (v0, v1, v2, v3);
147 SIPROUND (v0, v1, v2, v3);
154 SIPROUND (v0, v1, v2, v3);
155 SIPROUND (v0, v1, v2, v3);
156 SIPROUND (v0, v1, v2, v3);
157 SIPROUND (v0, v1, v2, v3);
159 const u64x v = v0 ^ v1 ^ v2 ^ v3;
161 const u32x a = l32_from_64 (v);
162 const u32x b = h32_from_64 (v);
169 #include VECT_COMPARE_M
173 static void m10100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
179 const u32 gid = get_global_id (0);
180 const u32 lid = get_local_id (0);
186 const u32 search[4] =
188 digests_buf[digests_offset].digest_buf[DGST_R0],
189 digests_buf[digests_offset].digest_buf[DGST_R1],
190 digests_buf[digests_offset].digest_buf[DGST_R2],
191 digests_buf[digests_offset].digest_buf[DGST_R3]
198 u64 v0p = SIPHASHM_0;
199 u64 v1p = SIPHASHM_1;
200 u64 v2p = SIPHASHM_2;
201 u64 v3p = SIPHASHM_3;
203 v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
204 v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
205 v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
206 v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
208 u64 *w_ptr = (u64 *) w;
210 w_ptr[pw_len / 8] |= (u64) pw_len << 56;
216 const u32 bf_loops = ceil ((float) bfs_cnt / VECT_DIV);
220 for (u32 il_pos = 0; il_pos < bf_loops; il_pos++)
222 const u32x w0r = words_buf_r[il_pos];
224 const u32x w0 = w0l | w0r;
231 u64x m = hl32_to_64 (w[1], w0);
235 SIPROUND (v0, v1, v2, v3);
236 SIPROUND (v0, v1, v2, v3);
243 for (i = 8, j = 2; i <= pw_len; i += 8, j += 2)
245 m = hl32_to_64 (w[j + 1], w[j + 0]);
249 SIPROUND (v0, v1, v2, v3);
250 SIPROUND (v0, v1, v2, v3);
257 SIPROUND (v0, v1, v2, v3);
258 SIPROUND (v0, v1, v2, v3);
259 SIPROUND (v0, v1, v2, v3);
260 SIPROUND (v0, v1, v2, v3);
262 const u64x v = v0 ^ v1 ^ v2 ^ v3;
264 const u32x a = l32_from_64 (v);
265 const u32x b = h32_from_64 (v);
272 #include VECT_COMPARE_S
276 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_m04 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
282 const u32 gid = get_global_id (0);
284 if (gid >= gid_max) return;
288 w[ 0] = pws[gid].i[ 0];
289 w[ 1] = pws[gid].i[ 1];
290 w[ 2] = pws[gid].i[ 2];
291 w[ 3] = pws[gid].i[ 3];
302 w[14] = pws[gid].i[14];
305 const u32 pw_len = pws[gid].pw_len;
311 m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
314 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_m08 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
320 const u32 gid = get_global_id (0);
322 if (gid >= gid_max) return;
326 w[ 0] = pws[gid].i[ 0];
327 w[ 1] = pws[gid].i[ 1];
328 w[ 2] = pws[gid].i[ 2];
329 w[ 3] = pws[gid].i[ 3];
330 w[ 4] = pws[gid].i[ 4];
331 w[ 5] = pws[gid].i[ 5];
332 w[ 6] = pws[gid].i[ 6];
333 w[ 7] = pws[gid].i[ 7];
340 w[14] = pws[gid].i[14];
343 const u32 pw_len = pws[gid].pw_len;
349 m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
352 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_m16 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
358 const u32 gid = get_global_id (0);
360 if (gid >= gid_max) return;
364 w[ 0] = pws[gid].i[ 0];
365 w[ 1] = pws[gid].i[ 1];
366 w[ 2] = pws[gid].i[ 2];
367 w[ 3] = pws[gid].i[ 3];
368 w[ 4] = pws[gid].i[ 4];
369 w[ 5] = pws[gid].i[ 5];
370 w[ 6] = pws[gid].i[ 6];
371 w[ 7] = pws[gid].i[ 7];
372 w[ 8] = pws[gid].i[ 8];
373 w[ 9] = pws[gid].i[ 9];
374 w[10] = pws[gid].i[10];
375 w[11] = pws[gid].i[11];
376 w[12] = pws[gid].i[12];
377 w[13] = pws[gid].i[13];
378 w[14] = pws[gid].i[14];
379 w[15] = pws[gid].i[15];
381 const u32 pw_len = pws[gid].pw_len;
387 m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
390 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_s04 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
396 const u32 gid = get_global_id (0);
398 if (gid >= gid_max) return;
402 w[ 0] = pws[gid].i[ 0];
403 w[ 1] = pws[gid].i[ 1];
404 w[ 2] = pws[gid].i[ 2];
405 w[ 3] = pws[gid].i[ 3];
416 w[14] = pws[gid].i[14];
419 const u32 pw_len = pws[gid].pw_len;
425 m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
428 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_s08 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
434 const u32 gid = get_global_id (0);
436 if (gid >= gid_max) return;
440 w[ 0] = pws[gid].i[ 0];
441 w[ 1] = pws[gid].i[ 1];
442 w[ 2] = pws[gid].i[ 2];
443 w[ 3] = pws[gid].i[ 3];
444 w[ 4] = pws[gid].i[ 4];
445 w[ 5] = pws[gid].i[ 5];
446 w[ 6] = pws[gid].i[ 6];
447 w[ 7] = pws[gid].i[ 7];
454 w[14] = pws[gid].i[14];
457 const u32 pw_len = pws[gid].pw_len;
463 m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
466 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_s16 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
472 const u32 gid = get_global_id (0);
474 if (gid >= gid_max) return;
478 w[ 0] = pws[gid].i[ 0];
479 w[ 1] = pws[gid].i[ 1];
480 w[ 2] = pws[gid].i[ 2];
481 w[ 3] = pws[gid].i[ 3];
482 w[ 4] = pws[gid].i[ 4];
483 w[ 5] = pws[gid].i[ 5];
484 w[ 6] = pws[gid].i[ 6];
485 w[ 7] = pws[gid].i[ 7];
486 w[ 8] = pws[gid].i[ 8];
487 w[ 9] = pws[gid].i[ 9];
488 w[10] = pws[gid].i[10];
489 w[11] = pws[gid].i[11];
490 w[12] = pws[gid].i[12];
491 w[13] = pws[gid].i[13];
492 w[14] = pws[gid].i[14];
493 w[15] = pws[gid].i[15];
495 const u32 pw_len = pws[gid].pw_len;
501 m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);