2 * Author......: Jens Steube <jens.steube@gmail.com>
9 //#define NEW_SIMD_CODE
11 #include "include/constants.h"
12 #include "include/kernel_vendor.h"
19 #include "include/kernel_functions.c"
20 #include "OpenCL/types_ocl.c"
21 #include "OpenCL/common.c"
22 #include "OpenCL/simd.c"
/*
 * SIPROUND(v0,v1,v2,v3): multi-statement macro that updates its four
 * arguments in place. The lines visible here are the rotl64 rotations:
 * v1 by 13, v0 by 32, v3 by 16, v3 by 21, v1 by 17 and v2 by 32.
 *
 * The expansion does not end in a semicolon, so each call site supplies it.
 * Every argument is expanded more than once; pass plain variables only,
 * never expressions with side effects.
 *
 * NOTE(review): the gaps in the embedded line numbers indicate that the
 * statements between the rotations are not part of this excerpt. The
 * rotation amounts match the SipRound of SipHash - confirm against the
 * full file.
 */
24 #define SIPROUND(v0,v1,v2,v3) \
26 (v1) = rotl64 ((v1), 13); \
28 (v0) = rotl64 ((v0), 32); \
30 (v3) = rotl64 ((v3), 16); \
33 (v3) = rotl64 ((v3), 21); \
36 (v1) = rotl64 ((v1), 17); \
38 (v2) = rotl64 ((v2), 32)
/**
 * Kernel m10100_m04.
 *
 * Each work-item takes the base word pws[gid], combines it with the
 * candidates read from combs_buf (il_cnt of them, VECT_SIZE per iteration),
 * runs the combined message through SIPROUND-based mixing seeded from
 * SIPHASHM_0..3 and salt_bufs[salt_pos].salt_buf[0..3], and hands the two
 * 32-bit halves of the result to COMPARE_M_SIMD.
 *
 * Visible parameter usage: pws, combs_buf, salt_bufs, salt_pos, il_cnt,
 * combs_mode and gid_max are read directly in the lines shown here. The
 * remaining parameters are not referenced by any visible line.
 *
 * NOTE(review): this excerpt omits lines of the kernel (braces, the
 * declarations of pw_buf0/pw_buf1, w0..w3, v0..v3, i, j, c and d, and the
 * statements between the SIPROUND calls). The comments below describe only
 * the statements that are visible. Two rounds per message word plus four
 * closing rounds looks like SipHash-2-4 - confirm against the full file.
 */
40 __kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
// Index of this work-item within its work-group (dimension 0).
46 const u32 lid = get_local_id (0);
// Global index of this work-item (dimension 0); it selects the pws[] entry.
52 const u32 gid = get_global_id (0);
// Work-items at or beyond gid_max have nothing to do.
54 if (gid >= gid_max) return;
// Copy words i[0..7] of this work-item's base word into pw_buf0/pw_buf1.
59 pw_buf0[0] = pws[gid].i[0];
60 pw_buf0[1] = pws[gid].i[1];
61 pw_buf0[2] = pws[gid].i[2];
62 pw_buf0[3] = pws[gid].i[3];
63 pw_buf1[0] = pws[gid].i[4];
64 pw_buf1[1] = pws[gid].i[5];
65 pw_buf1[2] = pws[gid].i[6];
66 pw_buf1[3] = pws[gid].i[7];
// Length of the base word; used below for the combined length and as a shift offset.
68 const u32 pw_l_len = pws[gid].pw_len;
// Per-salt starting state, computed once before the candidate loop:
// begin with the SIPHASHM_* constants ...
74 u64x v0p = SIPHASHM_0;
75 u64x v1p = SIPHASHM_1;
76 u64x v2p = SIPHASHM_2;
77 u64x v3p = SIPHASHM_3;
// ... then XOR in the salt: salt_buf[1]/[0] go into v0p and v2p,
// salt_buf[3]/[2] go into v1p and v3p.
79 v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
80 v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
81 v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
82 v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
// Walk the il_cnt candidates in combs_buf, VECT_SIZE per iteration.
88 for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
// Presumably the length of the candidate(s) at il_pos - confirm in pwlenx_create_combt.
90 const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
// Length of the combined word.
92 const u32x pw_len = pw_l_len + pw_r_len;
95 * concat password candidate
98 u32x wordl0[4] = { 0 };
99 u32x wordl1[4] = { 0 };
100 u32x wordl2[4] = { 0 };
101 u32x wordl3[4] = { 0 };
// Left word: the eight base-word words; wordl2/wordl3 keep their zero initialisers here.
103 wordl0[0] = pw_buf0[0];
104 wordl0[1] = pw_buf0[1];
105 wordl0[2] = pw_buf0[2];
106 wordl0[3] = pw_buf0[3];
107 wordl1[0] = pw_buf1[0];
108 wordl1[1] = pw_buf1[1];
109 wordl1[2] = pw_buf1[2];
110 wordl1[3] = pw_buf1[3];
112 u32x wordr0[4] = { 0 };
113 u32x wordr1[4] = { 0 };
114 u32x wordr2[4] = { 0 };
115 u32x wordr3[4] = { 0 };
// Right word: words 0..7 of the candidate at il_pos; wordr2/wordr3 keep their zero initialisers here.
117 wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
118 wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
119 wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
120 wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
121 wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
122 wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
123 wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
124 wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
// With COMBINATOR_MODE_BASE_LEFT the right word is shifted by the base-word length.
126 if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
128 switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
// NOTE(review): presumably the else branch (the `else` itself is not visible):
// the left word is shifted by the candidate length instead.
132 switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
// Merge the two words into the message words w0..w3 with a bitwise OR.
140 w0[0] = wordl0[0] | wordr0[0];
141 w0[1] = wordl0[1] | wordr0[1];
142 w0[2] = wordl0[2] | wordr0[2];
143 w0[3] = wordl0[3] | wordr0[3];
144 w1[0] = wordl1[0] | wordr1[0];
145 w1[1] = wordl1[1] | wordr1[1];
146 w1[2] = wordl1[2] | wordr1[2];
147 w1[3] = wordl1[3] | wordr1[3];
148 w2[0] = wordl2[0] | wordr2[0];
149 w2[1] = wordl2[1] | wordr2[1];
150 w2[2] = wordl2[2] | wordr2[2];
151 w2[3] = wordl2[3] | wordr2[3];
152 w3[0] = wordl3[0] | wordr3[0];
153 w3[1] = wordl3[1] | wordr3[1];
154 w3[2] = wordl3[2] | wordr3[2];
155 w3[3] = wordl3[3] | wordr3[3];
// OR pw_len, shifted into bits 24 and up, into the odd-indexed word chosen by the case label.
// Only cases 0..3 (words in w0 and w1) are visible.
// NOTE(review): the switch expression is not visible; presumably pw_len / 8 - confirm.
163 case 0: w0[1] |= pw_len << 24; break;
164 case 1: w0[3] |= pw_len << 24; break;
165 case 2: w1[1] |= pw_len << 24; break;
166 case 3: w1[3] |= pw_len << 24; break;
// Feed w0: i advances by 8 and j by two words per iteration,
// for as long as i <= pw_len and i < 16.
177 for (i = 0, j = 0; i <= pw_len && i < 16; i += 8, j += 2)
// Build the 64-bit message word from the word pair w0[j+1], w0[j].
179 u64x m = hl32_to_64 (w0[j + 1], w0[j + 0]);
// Two rounds per message word.
183 SIPROUND (v0, v1, v2, v3);
184 SIPROUND (v0, v1, v2, v3);
// Continue with w1: i carries over from the loop above, j restarts at 0, upper bound is now 32.
189 for ( j = 0; i <= pw_len && i < 32; i += 8, j += 2)
191 u64x m = hl32_to_64 (w1[j + 1], w1[j + 0]);
195 SIPROUND (v0, v1, v2, v3);
196 SIPROUND (v0, v1, v2, v3);
// Four closing rounds after all message words have been consumed.
203 SIPROUND (v0, v1, v2, v3);
204 SIPROUND (v0, v1, v2, v3);
205 SIPROUND (v0, v1, v2, v3);
206 SIPROUND (v0, v1, v2, v3);
// Fold the four state words into one 64-bit value.
208 const u64x v = v0 ^ v1 ^ v2 ^ v3;
// Split it into two 32-bit values via l32_from_64 / h32_from_64.
210 const u32x a = l32_from_64 (v);
211 const u32x b = h32_from_64 (v);
// Hand the result words to COMPARE_M_SIMD (c and d are defined on lines not shown here).
215 COMPARE_M_SIMD (a, b, c, d);
/**
 * Kernel m10100_m08. It takes the same parameter list as the other m10100_*
 * kernels in this file.
 * NOTE(review): no body statements for this kernel are visible here;
 * presumably an empty stub - confirm against the full file.
 */
219 __kernel void m10100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
/**
 * Kernel m10100_m16. It takes the same parameter list as the other m10100_*
 * kernels in this file.
 * NOTE(review): no body statements for this kernel are visible here;
 * presumably an empty stub - confirm against the full file.
 */
223 __kernel void m10100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
/**
 * Kernel m10100_s04.
 *
 * Each work-item takes the base word pws[gid], combines it with the
 * candidates read from combs_buf (il_cnt of them, VECT_SIZE per iteration),
 * runs the combined message through SIPROUND-based mixing seeded from
 * SIPHASHM_0..3 and salt_bufs[salt_pos].salt_buf[0..3], and hands the two
 * 32-bit halves of the result to COMPARE_S_SIMD. Before the loop it loads
 * the four digest words of digests_buf[digests_offset] into search[].
 *
 * Visible parameter usage: pws, combs_buf, salt_bufs, digests_buf, salt_pos,
 * il_cnt, digests_offset, combs_mode and gid_max are read directly in the
 * lines shown here. The remaining parameters are not referenced by any
 * visible line.
 *
 * NOTE(review): this excerpt omits lines of the kernel (braces, the
 * declarations of pw_buf0/pw_buf1, w0..w3, v0..v3, i, j, c and d, and the
 * statements between the SIPROUND calls). The comments below describe only
 * the statements that are visible. Two rounds per message word plus four
 * closing rounds looks like SipHash-2-4 - confirm against the full file.
 */
227 __kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
// Index of this work-item within its work-group (dimension 0).
233 const u32 lid = get_local_id (0);
// Global index of this work-item (dimension 0); it selects the pws[] entry.
239 const u32 gid = get_global_id (0);
// Work-items at or beyond gid_max have nothing to do.
241 if (gid >= gid_max) return;
// Copy words i[0..7] of this work-item's base word into pw_buf0/pw_buf1.
246 pw_buf0[0] = pws[gid].i[0];
247 pw_buf0[1] = pws[gid].i[1];
248 pw_buf0[2] = pws[gid].i[2];
249 pw_buf0[3] = pws[gid].i[3];
250 pw_buf1[0] = pws[gid].i[4];
251 pw_buf1[1] = pws[gid].i[5];
252 pw_buf1[2] = pws[gid].i[6];
253 pw_buf1[3] = pws[gid].i[7];
// Length of the base word; used below for the combined length and as a shift offset.
255 const u32 pw_l_len = pws[gid].pw_len;
// Per-salt starting state, computed once before the candidate loop:
// begin with the SIPHASHM_* constants ...
261 u64x v0p = SIPHASHM_0;
262 u64x v1p = SIPHASHM_1;
263 u64x v2p = SIPHASHM_2;
264 u64x v3p = SIPHASHM_3;
// ... then XOR in the salt: salt_buf[1]/[0] go into v0p and v2p,
// salt_buf[3]/[2] go into v1p and v3p.
266 v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
267 v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
268 v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
269 v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
// The four words (DGST_R0..DGST_R3) of the digest at digests_offset.
// NOTE(review): search[] is not referenced by any visible line; presumably
// COMPARE_S_SIMD reads it - confirm against the macro definition.
275 const u32 search[4] =
277 digests_buf[digests_offset].digest_buf[DGST_R0],
278 digests_buf[digests_offset].digest_buf[DGST_R1],
279 digests_buf[digests_offset].digest_buf[DGST_R2],
280 digests_buf[digests_offset].digest_buf[DGST_R3]
// Walk the il_cnt candidates in combs_buf, VECT_SIZE per iteration.
287 for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
// Presumably the length of the candidate(s) at il_pos - confirm in pwlenx_create_combt.
289 const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
// Length of the combined word.
291 const u32x pw_len = pw_l_len + pw_r_len;
294 * concat password candidate
297 u32x wordl0[4] = { 0 };
298 u32x wordl1[4] = { 0 };
299 u32x wordl2[4] = { 0 };
300 u32x wordl3[4] = { 0 };
// Left word: the eight base-word words; wordl2/wordl3 keep their zero initialisers here.
302 wordl0[0] = pw_buf0[0];
303 wordl0[1] = pw_buf0[1];
304 wordl0[2] = pw_buf0[2];
305 wordl0[3] = pw_buf0[3];
306 wordl1[0] = pw_buf1[0];
307 wordl1[1] = pw_buf1[1];
308 wordl1[2] = pw_buf1[2];
309 wordl1[3] = pw_buf1[3];
311 u32x wordr0[4] = { 0 };
312 u32x wordr1[4] = { 0 };
313 u32x wordr2[4] = { 0 };
314 u32x wordr3[4] = { 0 };
// Right word: words 0..7 of the candidate at il_pos; wordr2/wordr3 keep their zero initialisers here.
316 wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
317 wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
318 wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
319 wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
320 wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
321 wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
322 wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
323 wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
// With COMBINATOR_MODE_BASE_LEFT the right word is shifted by the base-word length.
325 if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
327 switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
// NOTE(review): presumably the else branch (the `else` itself is not visible):
// the left word is shifted by the candidate length instead.
331 switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
// Merge the two words into the message words w0..w3 with a bitwise OR.
339 w0[0] = wordl0[0] | wordr0[0];
340 w0[1] = wordl0[1] | wordr0[1];
341 w0[2] = wordl0[2] | wordr0[2];
342 w0[3] = wordl0[3] | wordr0[3];
343 w1[0] = wordl1[0] | wordr1[0];
344 w1[1] = wordl1[1] | wordr1[1];
345 w1[2] = wordl1[2] | wordr1[2];
346 w1[3] = wordl1[3] | wordr1[3];
347 w2[0] = wordl2[0] | wordr2[0];
348 w2[1] = wordl2[1] | wordr2[1];
349 w2[2] = wordl2[2] | wordr2[2];
350 w2[3] = wordl2[3] | wordr2[3];
351 w3[0] = wordl3[0] | wordr3[0];
352 w3[1] = wordl3[1] | wordr3[1];
353 w3[2] = wordl3[2] | wordr3[2];
354 w3[3] = wordl3[3] | wordr3[3];
// OR pw_len, shifted into bits 24 and up, into the odd-indexed word chosen by the case label.
// Only cases 0..3 (words in w0 and w1) are visible.
// NOTE(review): the switch expression is not visible; presumably pw_len / 8 - confirm.
362 case 0: w0[1] |= pw_len << 24; break;
363 case 1: w0[3] |= pw_len << 24; break;
364 case 2: w1[1] |= pw_len << 24; break;
365 case 3: w1[3] |= pw_len << 24; break;
// Feed w0: i advances by 8 and j by two words per iteration,
// for as long as i <= pw_len and i < 16.
376 for (i = 0, j = 0; i <= pw_len && i < 16; i += 8, j += 2)
// Build the 64-bit message word from the word pair w0[j+1], w0[j].
378 u64x m = hl32_to_64 (w0[j + 1], w0[j + 0]);
// Two rounds per message word.
382 SIPROUND (v0, v1, v2, v3);
383 SIPROUND (v0, v1, v2, v3);
// Continue with w1: i carries over from the loop above, j restarts at 0, upper bound is now 32.
388 for ( j = 0; i <= pw_len && i < 32; i += 8, j += 2)
390 u64x m = hl32_to_64 (w1[j + 1], w1[j + 0]);
394 SIPROUND (v0, v1, v2, v3);
395 SIPROUND (v0, v1, v2, v3);
// Four closing rounds after all message words have been consumed.
402 SIPROUND (v0, v1, v2, v3);
403 SIPROUND (v0, v1, v2, v3);
404 SIPROUND (v0, v1, v2, v3);
405 SIPROUND (v0, v1, v2, v3);
// Fold the four state words into one 64-bit value.
407 const u64x v = v0 ^ v1 ^ v2 ^ v3;
// Split it into two 32-bit values via l32_from_64 / h32_from_64.
409 const u32x a = l32_from_64 (v);
410 const u32x b = h32_from_64 (v);
// Hand the result words to COMPARE_S_SIMD (c and d are defined on lines not shown here).
414 COMPARE_S_SIMD (a, b, c, d);
/**
 * Kernel m10100_s08. It takes the same parameter list as the other m10100_*
 * kernels in this file.
 * NOTE(review): no body statements for this kernel are visible here;
 * presumably an empty stub - confirm against the full file.
 */
418 __kernel void m10100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
422 __kernel void m10100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)