2 * Author......: Jens Steube <jens.steube@gmail.com>
9 #include "include/constants.h"
10 #include "include/kernel_vendor.h"
25 #include "include/kernel_functions.c"
27 #include "common_nv.c"
30 #define VECT_COMPARE_S "check_single_vect1_comp4_warp.c"
31 #define VECT_COMPARE_M "check_multi_vect1_comp4_warp.c"
35 #define VECT_COMPARE_S "check_single_vect2_comp4_warp.c"
36 #define VECT_COMPARE_M "check_multi_vect2_comp4_warp.c"
40 #define VECT_COMPARE_S "check_single_vect4_comp4_warp.c"
41 #define VECT_COMPARE_M "check_multi_vect4_comp4_warp.c"
44 #define SIPROUND(v0,v1,v2,v3) \
46 (v1) = rotl64 ((v1), 13); \
48 (v0) = rotl64 ((v0), 32); \
50 (v3) = rotl64 ((v3), 16); \
53 (v3) = rotl64 ((v3), 21); \
56 (v1) = rotl64 ((v1), 17); \
58 (v2) = rotl64 ((v2), 32);
60 __device__ __constant__ u32x c_bfs[1024];
62 __device__ static void m10100m (u32 w[16], const u32 pw_len, const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const u32x * words_buf_r, void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
68 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
69 const u32 lid = threadIdx.x;
80 v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
81 v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
82 v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
83 v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
85 u64 *w_ptr = (u64 *) w;
87 w_ptr[pw_len / 8] |= (u64) pw_len << 56;
93 const u32 bf_loops = ceil ((float) bfs_cnt / VECT_DIV);
97 for (u32 il_pos = 0; il_pos < bf_loops; il_pos++)
99 const u32x w0r = c_bfs[il_pos];
101 const u32x w0 = w0l | w0r;
108 u64x m = hl32_to_64 (w[1], w0);
112 SIPROUND (v0, v1, v2, v3);
113 SIPROUND (v0, v1, v2, v3);
120 for (i = 8, j = 2; i <= pw_len; i += 8, j += 2)
122 m = hl32_to_64 (w[j + 1], w[j + 0]);
126 SIPROUND (v0, v1, v2, v3);
127 SIPROUND (v0, v1, v2, v3);
134 SIPROUND (v0, v1, v2, v3);
135 SIPROUND (v0, v1, v2, v3);
136 SIPROUND (v0, v1, v2, v3);
137 SIPROUND (v0, v1, v2, v3);
139 const u64x v = v0 ^ v1 ^ v2 ^ v3;
141 const u32x a = l32_from_64 (v);
142 const u32x b = h32_from_64 (v);
149 #include VECT_COMPARE_M
153 __device__ static void m10100s (u32 w[16], const u32 pw_len, const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const u32x * words_buf_r, void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
159 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
160 const u32 lid = threadIdx.x;
166 u64 v0p = SIPHASHM_0;
167 u64 v1p = SIPHASHM_1;
168 u64 v2p = SIPHASHM_2;
169 u64 v3p = SIPHASHM_3;
171 v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
172 v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
173 v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
174 v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
176 u64 *w_ptr = (u64 *) w;
178 w_ptr[pw_len / 8] |= (u64) pw_len << 56;
184 const u32 search[4] =
186 digests_buf[digests_offset].digest_buf[DGST_R0],
187 digests_buf[digests_offset].digest_buf[DGST_R1],
188 digests_buf[digests_offset].digest_buf[DGST_R2],
189 digests_buf[digests_offset].digest_buf[DGST_R3]
196 const u32 bf_loops = ceil ((float) bfs_cnt / VECT_DIV);
200 for (u32 il_pos = 0; il_pos < bf_loops; il_pos++)
202 const u32x w0r = c_bfs[il_pos];
204 const u32x w0 = w0l | w0r;
211 u64x m = hl32_to_64 (w[1], w0);
215 SIPROUND (v0, v1, v2, v3);
216 SIPROUND (v0, v1, v2, v3);
223 for (i = 8, j = 2; i <= pw_len; i += 8, j += 2)
225 m = hl32_to_64 (w[j + 1], w[j + 0]);
229 SIPROUND (v0, v1, v2, v3);
230 SIPROUND (v0, v1, v2, v3);
237 SIPROUND (v0, v1, v2, v3);
238 SIPROUND (v0, v1, v2, v3);
239 SIPROUND (v0, v1, v2, v3);
240 SIPROUND (v0, v1, v2, v3);
242 const u64x v = v0 ^ v1 ^ v2 ^ v3;
244 const u32x a = l32_from_64 (v);
245 const u32x b = h32_from_64 (v);
252 #include VECT_COMPARE_S
256 extern "C" __global__ void __launch_bounds__ (256, 1) m10100_m04 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const u32x *words_buf_r,void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
262 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
264 if (gid >= gid_max) return;
268 w[ 0] = pws[gid].i[ 0];
269 w[ 1] = pws[gid].i[ 1];
270 w[ 2] = pws[gid].i[ 2];
271 w[ 3] = pws[gid].i[ 3];
282 w[14] = pws[gid].i[14];
285 const u32 pw_len = pws[gid].pw_len;
291 m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
294 extern "C" __global__ void __launch_bounds__ (256, 1) m10100_m08 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const u32x *words_buf_r,void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
300 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
302 if (gid >= gid_max) return;
306 w[ 0] = pws[gid].i[ 0];
307 w[ 1] = pws[gid].i[ 1];
308 w[ 2] = pws[gid].i[ 2];
309 w[ 3] = pws[gid].i[ 3];
310 w[ 4] = pws[gid].i[ 4];
311 w[ 5] = pws[gid].i[ 5];
312 w[ 6] = pws[gid].i[ 6];
313 w[ 7] = pws[gid].i[ 7];
320 w[14] = pws[gid].i[14];
323 const u32 pw_len = pws[gid].pw_len;
329 m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
332 extern "C" __global__ void __launch_bounds__ (256, 1) m10100_m16 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const u32x *words_buf_r,void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
338 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
340 if (gid >= gid_max) return;
344 w[ 0] = pws[gid].i[ 0];
345 w[ 1] = pws[gid].i[ 1];
346 w[ 2] = pws[gid].i[ 2];
347 w[ 3] = pws[gid].i[ 3];
348 w[ 4] = pws[gid].i[ 4];
349 w[ 5] = pws[gid].i[ 5];
350 w[ 6] = pws[gid].i[ 6];
351 w[ 7] = pws[gid].i[ 7];
352 w[ 8] = pws[gid].i[ 8];
353 w[ 9] = pws[gid].i[ 9];
354 w[10] = pws[gid].i[10];
355 w[11] = pws[gid].i[11];
356 w[12] = pws[gid].i[12];
357 w[13] = pws[gid].i[13];
358 w[14] = pws[gid].i[14];
359 w[15] = pws[gid].i[15];
361 const u32 pw_len = pws[gid].pw_len;
367 m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
370 extern "C" __global__ void __launch_bounds__ (256, 1) m10100_s04 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const u32x *words_buf_r,void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
376 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
378 if (gid >= gid_max) return;
382 w[ 0] = pws[gid].i[ 0];
383 w[ 1] = pws[gid].i[ 1];
384 w[ 2] = pws[gid].i[ 2];
385 w[ 3] = pws[gid].i[ 3];
396 w[14] = pws[gid].i[14];
399 const u32 pw_len = pws[gid].pw_len;
405 m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
408 extern "C" __global__ void __launch_bounds__ (256, 1) m10100_s08 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const u32x *words_buf_r,void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
414 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
416 if (gid >= gid_max) return;
420 w[ 0] = pws[gid].i[ 0];
421 w[ 1] = pws[gid].i[ 1];
422 w[ 2] = pws[gid].i[ 2];
423 w[ 3] = pws[gid].i[ 3];
424 w[ 4] = pws[gid].i[ 4];
425 w[ 5] = pws[gid].i[ 5];
426 w[ 6] = pws[gid].i[ 6];
427 w[ 7] = pws[gid].i[ 7];
434 w[14] = pws[gid].i[14];
437 const u32 pw_len = pws[gid].pw_len;
443 m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
446 extern "C" __global__ void __launch_bounds__ (256, 1) m10100_s16 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const u32x *words_buf_r,void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
452 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
454 if (gid >= gid_max) return;
458 w[ 0] = pws[gid].i[ 0];
459 w[ 1] = pws[gid].i[ 1];
460 w[ 2] = pws[gid].i[ 2];
461 w[ 3] = pws[gid].i[ 3];
462 w[ 4] = pws[gid].i[ 4];
463 w[ 5] = pws[gid].i[ 5];
464 w[ 6] = pws[gid].i[ 6];
465 w[ 7] = pws[gid].i[ 7];
466 w[ 8] = pws[gid].i[ 8];
467 w[ 9] = pws[gid].i[ 9];
468 w[10] = pws[gid].i[10];
469 w[11] = pws[gid].i[11];
470 w[12] = pws[gid].i[12];
471 w[13] = pws[gid].i[13];
472 w[14] = pws[gid].i[14];
473 w[15] = pws[gid].i[15];
475 const u32 pw_len = pws[gid].pw_len;
481 m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);