2 * Author......: Jens Steube <jens.steube@gmail.com>
8 #include "include/constants.h"
9 #include "include/kernel_vendor.h"
24 #include "include/kernel_functions.c"
26 #include "common_nv.c"
29 #define VECT_COMPARE_S "check_single_vect1_comp4.c"
30 #define VECT_COMPARE_M "check_multi_vect1_comp4.c"
34 #define VECT_COMPARE_S "check_single_vect2_comp4.c"
35 #define VECT_COMPARE_M "check_multi_vect2_comp4.c"
39 #define KECCAK_ROUNDS 24
42 #define Theta1(s) (st[0 + s] ^ st[5 + s] ^ st[10 + s] ^ st[15 + s] ^ st[20 + s])
55 u32 j = keccakf_piln[s]; \
56 u32 k = keccakf_rotc[s]; \
58 st[j] = rotl64 (t, k); \
69 st[0 + s] ^= ~bc1 & bc2; \
70 st[1 + s] ^= ~bc2 & bc3; \
71 st[2 + s] ^= ~bc3 & bc4; \
72 st[3 + s] ^= ~bc4 & bc0; \
73 st[4 + s] ^= ~bc0 & bc1; \
76 __device__ __constant__ comb_t c_combs[1024];
78 extern "C" __global__ void __launch_bounds__ (256, 1) m05000_m04 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, const void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
84 const u32 lid = threadIdx.x;
90 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
92 if (gid >= gid_max) return;
98 const u64 keccakf_rndc[24] =
100 0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
101 0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
102 0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
103 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
104 0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
105 0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
106 0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
107 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
110 const u32 keccakf_rotc[24] =
112 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
113 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
116 const u32 keccakf_piln[24] =
118 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
119 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
124 wordl0[0] = pws[gid].i[ 0];
125 wordl0[1] = pws[gid].i[ 1];
126 wordl0[2] = pws[gid].i[ 2];
127 wordl0[3] = pws[gid].i[ 3];
131 wordl1[0] = pws[gid].i[ 4];
132 wordl1[1] = pws[gid].i[ 5];
133 wordl1[2] = pws[gid].i[ 6];
134 wordl1[3] = pws[gid].i[ 7];
150 const u32 pw_l_len = pws[gid].pw_len;
152 if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
154 append_0x01_2 (wordl0, wordl1, pw_l_len);
156 switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, c_combs[0].pw_len);
160 * 0x80 keccak, very special
163 const u32 mdlen = salt_bufs[salt_pos].keccak_mdlen;
165 const u32 rsiz = 200 - (2 * mdlen);
167 const u32 add80w = (rsiz - 1) / 8;
173 for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++)
175 const u32 pw_r_len = c_combs[il_pos].pw_len;
177 const u32 pw_len = pw_l_len + pw_r_len;
181 wordr0[0] = c_combs[il_pos].i[0];
182 wordr0[1] = c_combs[il_pos].i[1];
183 wordr0[2] = c_combs[il_pos].i[2];
184 wordr0[3] = c_combs[il_pos].i[3];
188 wordr1[0] = c_combs[il_pos].i[4];
189 wordr1[1] = c_combs[il_pos].i[5];
190 wordr1[2] = c_combs[il_pos].i[6];
191 wordr1[3] = c_combs[il_pos].i[7];
207 if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
209 append_0x01_2 (wordr0, wordr1, pw_r_len);
211 switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
216 w0[0] = wordl0[0] | wordr0[0];
217 w0[1] = wordl0[1] | wordr0[1];
218 w0[2] = wordl0[2] | wordr0[2];
219 w0[3] = wordl0[3] | wordr0[3];
223 w1[0] = wordl1[0] | wordr1[0];
224 w1[1] = wordl1[1] | wordr1[1];
225 w1[2] = wordl1[2] | wordr1[2];
226 w1[3] = wordl1[3] | wordr1[3];
230 w2[0] = wordl2[0] | wordr2[0];
231 w2[1] = wordl2[1] | wordr2[1];
232 w2[2] = wordl2[2] | wordr2[2];
233 w2[3] = wordl2[3] | wordr2[3];
237 w3[0] = wordl3[0] | wordr3[0];
238 w3[1] = wordl3[1] | wordr3[1];
244 st[ 0] = (u64x) (w0[0]) | (u64x) (w0[1]) << 32;
245 st[ 1] = (u64x) (w0[2]) | (u64x) (w0[3]) << 32;
246 st[ 2] = (u64x) (w1[0]) | (u64x) (w1[1]) << 32;
247 st[ 3] = (u64x) (w1[2]) | (u64x) (w1[3]) << 32;
270 st[add80w] |= 0x8000000000000000;
274 for (round = 0; round < KECCAK_ROUNDS; round++)
278 u64x bc0 = Theta1 (0);
279 u64x bc1 = Theta1 (1);
280 u64x bc2 = Theta1 (2);
281 u64x bc3 = Theta1 (3);
282 u64x bc4 = Theta1 (4);
286 t = bc4 ^ rotl64 (bc1, 1); Theta2 (0);
287 t = bc0 ^ rotl64 (bc2, 1); Theta2 (1);
288 t = bc1 ^ rotl64 (bc3, 1); Theta2 (2);
289 t = bc2 ^ rotl64 (bc4, 1); Theta2 (3);
290 t = bc3 ^ rotl64 (bc0, 1); Theta2 (4);
331 st[0] ^= keccakf_rndc[round];
334 const u32x r0 = l32_from_64 (st[1]);
335 const u32x r1 = h32_from_64 (st[1]);
336 const u32x r2 = l32_from_64 (st[2]);
337 const u32x r3 = h32_from_64 (st[2]);
339 #include VECT_COMPARE_M
343 extern "C" __global__ void __launch_bounds__ (256, 1) m05000_m08 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, const void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
347 extern "C" __global__ void __launch_bounds__ (256, 1) m05000_m16 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, const void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
351 extern "C" __global__ void __launch_bounds__ (256, 1) m05000_s04 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, const void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
357 const u32 lid = threadIdx.x;
363 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
365 if (gid >= gid_max) return;
371 const u32 search[4] =
373 digests_buf[digests_offset].digest_buf[DGST_R0],
374 digests_buf[digests_offset].digest_buf[DGST_R1],
375 digests_buf[digests_offset].digest_buf[DGST_R2],
376 digests_buf[digests_offset].digest_buf[DGST_R3]
383 const u64 keccakf_rndc[24] =
385 0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
386 0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
387 0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
388 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
389 0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
390 0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
391 0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
392 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
395 const u32 keccakf_rotc[24] =
397 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
398 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
401 const u32 keccakf_piln[24] =
403 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
404 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
409 wordl0[0] = pws[gid].i[ 0];
410 wordl0[1] = pws[gid].i[ 1];
411 wordl0[2] = pws[gid].i[ 2];
412 wordl0[3] = pws[gid].i[ 3];
416 wordl1[0] = pws[gid].i[ 4];
417 wordl1[1] = pws[gid].i[ 5];
418 wordl1[2] = pws[gid].i[ 6];
419 wordl1[3] = pws[gid].i[ 7];
435 const u32 pw_l_len = pws[gid].pw_len;
437 if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
439 append_0x01_2 (wordl0, wordl1, pw_l_len);
441 switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, c_combs[0].pw_len);
445 * 0x80 keccak, very special
448 const u32 mdlen = salt_bufs[salt_pos].keccak_mdlen;
450 const u32 rsiz = 200 - (2 * mdlen);
452 const u32 add80w = (rsiz - 1) / 8;
458 for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++)
460 const u32 pw_r_len = c_combs[il_pos].pw_len;
462 const u32 pw_len = pw_l_len + pw_r_len;
466 wordr0[0] = c_combs[il_pos].i[0];
467 wordr0[1] = c_combs[il_pos].i[1];
468 wordr0[2] = c_combs[il_pos].i[2];
469 wordr0[3] = c_combs[il_pos].i[3];
473 wordr1[0] = c_combs[il_pos].i[4];
474 wordr1[1] = c_combs[il_pos].i[5];
475 wordr1[2] = c_combs[il_pos].i[6];
476 wordr1[3] = c_combs[il_pos].i[7];
492 if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
494 append_0x01_2 (wordr0, wordr1, pw_r_len);
496 switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
501 w0[0] = wordl0[0] | wordr0[0];
502 w0[1] = wordl0[1] | wordr0[1];
503 w0[2] = wordl0[2] | wordr0[2];
504 w0[3] = wordl0[3] | wordr0[3];
508 w1[0] = wordl1[0] | wordr1[0];
509 w1[1] = wordl1[1] | wordr1[1];
510 w1[2] = wordl1[2] | wordr1[2];
511 w1[3] = wordl1[3] | wordr1[3];
515 w2[0] = wordl2[0] | wordr2[0];
516 w2[1] = wordl2[1] | wordr2[1];
517 w2[2] = wordl2[2] | wordr2[2];
518 w2[3] = wordl2[3] | wordr2[3];
522 w3[0] = wordl3[0] | wordr3[0];
523 w3[1] = wordl3[1] | wordr3[1];
529 st[ 0] = (u64x) (w0[0]) | (u64x) (w0[1]) << 32;
530 st[ 1] = (u64x) (w0[2]) | (u64x) (w0[3]) << 32;
531 st[ 2] = (u64x) (w1[0]) | (u64x) (w1[1]) << 32;
532 st[ 3] = (u64x) (w1[2]) | (u64x) (w1[3]) << 32;
555 st[add80w] |= 0x8000000000000000;
559 for (round = 0; round < KECCAK_ROUNDS; round++)
563 u64x bc0 = Theta1 (0);
564 u64x bc1 = Theta1 (1);
565 u64x bc2 = Theta1 (2);
566 u64x bc3 = Theta1 (3);
567 u64x bc4 = Theta1 (4);
571 t = bc4 ^ rotl64 (bc1, 1); Theta2 (0);
572 t = bc0 ^ rotl64 (bc2, 1); Theta2 (1);
573 t = bc1 ^ rotl64 (bc3, 1); Theta2 (2);
574 t = bc2 ^ rotl64 (bc4, 1); Theta2 (3);
575 t = bc3 ^ rotl64 (bc0, 1); Theta2 (4);
616 st[0] ^= keccakf_rndc[round];
619 const u32x r0 = l32_from_64 (st[1]);
620 const u32x r1 = h32_from_64 (st[1]);
621 const u32x r2 = l32_from_64 (st[2]);
622 const u32x r3 = h32_from_64 (st[2]);
624 #include VECT_COMPARE_S
628 extern "C" __global__ void __launch_bounds__ (256, 1) m05000_s08 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, const void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
632 extern "C" __global__ void __launch_bounds__ (256, 1) m05000_s16 (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, const void *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)