/**
 * Author......: Jens Steube <jens.steube@gmail.com>
 */
#include "include/constants.h"
#include "include/kernel_vendor.h"

#include "include/kernel_functions.c"
#include "common_nv.c"
#ifdef  VECT_SIZE1
#define VECT_COMPARE_M "check_multi_vect1_comp4.c"
#endif

#ifdef  VECT_SIZE2
#define VECT_COMPARE_M "check_multi_vect2_comp4.c"
#endif

#ifdef  VECT_SIZE4
#define VECT_COMPARE_M "check_multi_vect4_comp4.c"
#endif
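/**
 * uintm: a hand-rolled 4-lane u32 vector class (the CUDA counterpart of
 * OpenCL's uint4) with just enough constructors and operators for the
 * vectorized SHA-256 and Salsa20/8 code below.
 */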
class uintm
{
  public:

  u32 x, y, z, w;

  inline __device__  uintm (const u32 a, const u32 b, const u32 c, const u32 d) : x(a), y(b), z(c), w(d) { }
  inline __device__  uintm (const u32 a)                                        : x(a), y(a), z(a), w(a) { }

  inline __device__  uintm (void) { }
  inline __device__ ~uintm (void) { }
};
__device__ static uintm __byte_perm (const uintm a, const uintm b, const u32 c)
{
  return uintm (__byte_perm (a.x, b.x, c),
                __byte_perm (a.y, b.y, c),
                __byte_perm (a.z, b.z, c),
                __byte_perm (a.w, b.w, c));
}
__device__ static uintm rotate (const uintm a, const unsigned int n)
{
  return uintm (rotl32 (a.x, n),
                rotl32 (a.y, n),
                rotl32 (a.z, n),
                rotl32 (a.w, n));
}
inline __device__ uintm wxyz (const uintm a) { return uintm (a.w, a.x, a.y, a.z); }
inline __device__ uintm zwxy (const uintm a) { return uintm (a.z, a.w, a.x, a.y); }

inline __device__ uintm operator << (const uintm a, const u32   b) { return uintm ((a.x << b  ), (a.y << b  ), (a.z << b  ), (a.w << b  )); }
inline __device__ uintm operator << (const uintm a, const uintm b) { return uintm ((a.x << b.x), (a.y << b.y), (a.z << b.z), (a.w << b.w)); }

inline __device__ uintm operator >> (const uintm a, const u32   b) { return uintm ((a.x >> b  ), (a.y >> b  ), (a.z >> b  ), (a.w >> b  )); }
inline __device__ uintm operator >> (const uintm a, const uintm b) { return uintm ((a.x >> b.x), (a.y >> b.y), (a.z >> b.z), (a.w >> b.w)); }

inline __device__ uintm operator ^  (const uintm a, const u32   b) { return uintm ((a.x ^  b  ), (a.y ^  b  ), (a.z ^  b  ), (a.w ^  b  )); }
inline __device__ uintm operator ^  (const uintm a, const uintm b) { return uintm ((a.x ^  b.x), (a.y ^  b.y), (a.z ^  b.z), (a.w ^  b.w)); }

inline __device__ uintm operator |  (const uintm a, const u32   b) { return uintm ((a.x |  b  ), (a.y |  b  ), (a.z |  b  ), (a.w |  b  )); }
inline __device__ uintm operator |  (const uintm a, const uintm b) { return uintm ((a.x |  b.x), (a.y |  b.y), (a.z |  b.z), (a.w |  b.w)); }

inline __device__ uintm operator &  (const uintm a, const u32   b) { return uintm ((a.x &  b  ), (a.y &  b  ), (a.z &  b  ), (a.w &  b  )); }
inline __device__ uintm operator &  (const uintm a, const uintm b) { return uintm ((a.x &  b.x), (a.y &  b.y), (a.z &  b.z), (a.w &  b.w)); }

inline __device__ uintm operator +  (const uintm a, const u32   b) { return uintm ((a.x +  b  ), (a.y +  b  ), (a.z +  b  ), (a.w +  b  )); }
inline __device__ uintm operator +  (const uintm a, const uintm b) { return uintm ((a.x +  b.x), (a.y +  b.y), (a.z +  b.z), (a.w +  b.w)); }

inline __device__ void operator ^= (uintm &a, const u32   b) { a.x ^= b;   a.y ^= b;   a.z ^= b;   a.w ^= b;   }
inline __device__ void operator ^= (uintm &a, const uintm b) { a.x ^= b.x; a.y ^= b.y; a.z ^= b.z; a.w ^= b.w; }

inline __device__ void operator += (uintm &a, const u32   b) { a.x += b;   a.y += b;   a.z += b;   a.w += b;   }
inline __device__ void operator += (uintm &a, const uintm b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
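/**
 * SHA-256 round constants K[0..63]; the SHA256Cxx macros come from
 * include/constants.h.
 */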
__constant__ u32 k_sha256[64] =
{
  SHA256C00, SHA256C01, SHA256C02, SHA256C03,
  SHA256C04, SHA256C05, SHA256C06, SHA256C07,
  SHA256C08, SHA256C09, SHA256C0a, SHA256C0b,
  SHA256C0c, SHA256C0d, SHA256C0e, SHA256C0f,
  SHA256C10, SHA256C11, SHA256C12, SHA256C13,
  SHA256C14, SHA256C15, SHA256C16, SHA256C17,
  SHA256C18, SHA256C19, SHA256C1a, SHA256C1b,
  SHA256C1c, SHA256C1d, SHA256C1e, SHA256C1f,
  SHA256C20, SHA256C21, SHA256C22, SHA256C23,
  SHA256C24, SHA256C25, SHA256C26, SHA256C27,
  SHA256C28, SHA256C29, SHA256C2a, SHA256C2b,
  SHA256C2c, SHA256C2d, SHA256C2e, SHA256C2f,
  SHA256C30, SHA256C31, SHA256C32, SHA256C33,
  SHA256C34, SHA256C35, SHA256C36, SHA256C37,
  SHA256C38, SHA256C39, SHA256C3a, SHA256C3b,
  SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f,
};
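/**
 * One SHA-256 compression of a single 64-byte block: w0..w3 hold the 16
 * message words, digest[] carries the 8-word chaining state.
 */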
__device__ static void sha256_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[8])
  #define ROUND_EXPAND()                            \
  {                                                 \
    w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t);  \
    w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t);  \
    w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t);  \
    w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t);  \
    w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t);  \
    w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t);  \
    w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t);  \
    w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t);  \
    w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t);  \
    w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t);  \
    wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t);  \
    wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t);  \
    wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t);  \
    wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t);  \
    we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t);  \
    wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t);  \
  }
  #define ROUND_STEP(i)                                                                     \
  {                                                                                         \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha256[i +  0]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha256[i +  1]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha256[i +  2]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha256[i +  3]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha256[i +  4]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha256[i +  5]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha256[i +  6]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha256[i +  7]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha256[i +  8]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha256[i +  9]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]);   \
    SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]);   \
  }
  for (int i = 16; i < 64; i += 16)
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
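/**
 * HMAC-SHA256 key setup: XOR the key block with 0x36 bytes and compress
 * into ipad[], then re-XOR with 0x6a to reach the opad form (the words
 * already carry 0x36, and 0x36 ^ 0x6a == 0x5c) and compress into opad[].
 */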
__device__ static void hmac_sha256_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8])
{
  w0[0] = w0[0] ^ 0x36363636;
  w0[1] = w0[1] ^ 0x36363636;
  w0[2] = w0[2] ^ 0x36363636;
  w0[3] = w0[3] ^ 0x36363636;
  w1[0] = w1[0] ^ 0x36363636;
  w1[1] = w1[1] ^ 0x36363636;
  w1[2] = w1[2] ^ 0x36363636;
  w1[3] = w1[3] ^ 0x36363636;
  w2[0] = w2[0] ^ 0x36363636;
  w2[1] = w2[1] ^ 0x36363636;
  w2[2] = w2[2] ^ 0x36363636;
  w2[3] = w2[3] ^ 0x36363636;
  w3[0] = w3[0] ^ 0x36363636;
  w3[1] = w3[1] ^ 0x36363636;
  w3[2] = w3[2] ^ 0x36363636;
  w3[3] = w3[3] ^ 0x36363636;
  ipad[0] = SHA256M_A; ipad[1] = SHA256M_B; ipad[2] = SHA256M_C; ipad[3] = SHA256M_D;
  ipad[4] = SHA256M_E; ipad[5] = SHA256M_F; ipad[6] = SHA256M_G; ipad[7] = SHA256M_H;

  sha256_transform (w0, w1, w2, w3, ipad);

  w0[0] = w0[0] ^ 0x6a6a6a6a;
  w0[1] = w0[1] ^ 0x6a6a6a6a;
  w0[2] = w0[2] ^ 0x6a6a6a6a;
  w0[3] = w0[3] ^ 0x6a6a6a6a;
  w1[0] = w1[0] ^ 0x6a6a6a6a;
  w1[1] = w1[1] ^ 0x6a6a6a6a;
  w1[2] = w1[2] ^ 0x6a6a6a6a;
  w1[3] = w1[3] ^ 0x6a6a6a6a;
  w2[0] = w2[0] ^ 0x6a6a6a6a;
  w2[1] = w2[1] ^ 0x6a6a6a6a;
  w2[2] = w2[2] ^ 0x6a6a6a6a;
  w2[3] = w2[3] ^ 0x6a6a6a6a;
  w3[0] = w3[0] ^ 0x6a6a6a6a;
  w3[1] = w3[1] ^ 0x6a6a6a6a;
  w3[2] = w3[2] ^ 0x6a6a6a6a;
  w3[3] = w3[3] ^ 0x6a6a6a6a;

  opad[0] = SHA256M_A; opad[1] = SHA256M_B; opad[2] = SHA256M_C; opad[3] = SHA256M_D;
  opad[4] = SHA256M_E; opad[5] = SHA256M_F; opad[6] = SHA256M_G; opad[7] = SHA256M_H;

  sha256_transform (w0, w1, w2, w3, opad);
}
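/**
 * HMAC-SHA256 finalization: compress the message block against the ipad
 * state, then feed the 32-byte inner digest, padded to one block, through
 * the opad state; (64 + 32) * 8 is the outer message length in bits.
 */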
__device__ static void hmac_sha256_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8], u32x digest[8])
{
  digest[0] = ipad[0]; digest[1] = ipad[1]; digest[2] = ipad[2]; digest[3] = ipad[3];
  digest[4] = ipad[4]; digest[5] = ipad[5]; digest[6] = ipad[6]; digest[7] = ipad[7];

  sha256_transform (w0, w1, w2, w3, digest);

  w0[0] = digest[0]; w0[1] = digest[1]; w0[2] = digest[2]; w0[3] = digest[3];
  w1[0] = digest[4]; w1[1] = digest[5]; w1[2] = digest[6]; w1[3] = digest[7];
  w2[0] = 0x80000000; w2[1] = 0; w2[2] = 0; w2[3] = 0;
  w3[0] = 0; w3[1] = 0; w3[2] = 0;
  w3[3] = (64 + 32) * 8;

  digest[0] = opad[0]; digest[1] = opad[1]; digest[2] = opad[2]; digest[3] = opad[3];
  digest[4] = opad[4]; digest[5] = opad[5]; digest[6] = opad[6]; digest[7] = opad[7];

  sha256_transform (w0, w1, w2, w3, digest);
}
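/**
 * memcat8 appends the 8 bytes in append[2] to the 64-byte block
 * block0..block3 at byte offset block_len, one arm per offset: aligned
 * offsets store both words directly, unaligned offsets split them across
 * neighboring words with complementary shifts.
 */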
__device__ static void memcat8 (u32x block0[4], u32x block1[4], u32x block2[4], u32x block3[4], const u32 block_len, const u32 append[2])
{
  block0[0] = append[0];
  block0[1] = append[1];

  block0[0] = block0[0] | append[0] << 8;
  block0[1] = append[0] >> 24 | append[1] << 8;
  block0[2] = append[1] >> 24;

  block0[0] = block0[0] | append[0] << 16;
  block0[1] = append[0] >> 16 | append[1] << 16;
  block0[2] = append[1] >> 16;

  block0[0] = block0[0] | append[0] << 24;
  block0[1] = append[0] >> 8 | append[1] << 24;
  block0[2] = append[1] >> 8;

  block0[1] = append[0];
  block0[2] = append[1];

  block0[1] = block0[1] | append[0] << 8;
  block0[2] = append[0] >> 24 | append[1] << 8;
  block0[3] = append[1] >> 24;

  block0[1] = block0[1] | append[0] << 16;
  block0[2] = append[0] >> 16 | append[1] << 16;
  block0[3] = append[1] >> 16;

  block0[1] = block0[1] | append[0] << 24;
  block0[2] = append[0] >> 8 | append[1] << 24;
  block0[3] = append[1] >> 8;

  block0[2] = append[0];
  block0[3] = append[1];

  block0[2] = block0[2] | append[0] << 8;
  block0[3] = append[0] >> 24 | append[1] << 8;
  block1[0] = append[1] >> 24;

  block0[2] = block0[2] | append[0] << 16;
  block0[3] = append[0] >> 16 | append[1] << 16;
  block1[0] = append[1] >> 16;

  block0[2] = block0[2] | append[0] << 24;
  block0[3] = append[0] >> 8 | append[1] << 24;
  block1[0] = append[1] >> 8;

  block0[3] = append[0];
  block1[0] = append[1];

  block0[3] = block0[3] | append[0] << 8;
  block1[0] = append[0] >> 24 | append[1] << 8;
  block1[1] = append[1] >> 24;

  block0[3] = block0[3] | append[0] << 16;
  block1[0] = append[0] >> 16 | append[1] << 16;
  block1[1] = append[1] >> 16;

  block0[3] = block0[3] | append[0] << 24;
  block1[0] = append[0] >> 8 | append[1] << 24;
  block1[1] = append[1] >> 8;

  block1[0] = append[0];
  block1[1] = append[1];

  block1[0] = block1[0] | append[0] << 8;
  block1[1] = append[0] >> 24 | append[1] << 8;
  block1[2] = append[1] >> 24;

  block1[0] = block1[0] | append[0] << 16;
  block1[1] = append[0] >> 16 | append[1] << 16;
  block1[2] = append[1] >> 16;

  block1[0] = block1[0] | append[0] << 24;
  block1[1] = append[0] >> 8 | append[1] << 24;
  block1[2] = append[1] >> 8;

  block1[1] = append[0];
  block1[2] = append[1];

  block1[1] = block1[1] | append[0] << 8;
  block1[2] = append[0] >> 24 | append[1] << 8;
  block1[3] = append[1] >> 24;

  block1[1] = block1[1] | append[0] << 16;
  block1[2] = append[0] >> 16 | append[1] << 16;
  block1[3] = append[1] >> 16;

  block1[1] = block1[1] | append[0] << 24;
  block1[2] = append[0] >> 8 | append[1] << 24;
  block1[3] = append[1] >> 8;

  block1[2] = append[0];
  block1[3] = append[1];

  block1[2] = block1[2] | append[0] << 8;
  block1[3] = append[0] >> 24 | append[1] << 8;
  block2[0] = append[1] >> 24;

  block1[2] = block1[2] | append[0] << 16;
  block1[3] = append[0] >> 16 | append[1] << 16;
  block2[0] = append[1] >> 16;

  block1[2] = block1[2] | append[0] << 24;
  block1[3] = append[0] >> 8 | append[1] << 24;
  block2[0] = append[1] >> 8;

  block1[3] = append[0];
  block2[0] = append[1];

  block1[3] = block1[3] | append[0] << 8;
  block2[0] = append[0] >> 24 | append[1] << 8;
  block2[1] = append[1] >> 24;

  block1[3] = block1[3] | append[0] << 16;
  block2[0] = append[0] >> 16 | append[1] << 16;
  block2[1] = append[1] >> 16;

  block1[3] = block1[3] | append[0] << 24;
  block2[0] = append[0] >> 8 | append[1] << 24;
  block2[1] = append[1] >> 8;

  block2[0] = append[0];
  block2[1] = append[1];

  block2[0] = block2[0] | append[0] << 8;
  block2[1] = append[0] >> 24 | append[1] << 8;
  block2[2] = append[1] >> 24;

  block2[0] = block2[0] | append[0] << 16;
  block2[1] = append[0] >> 16 | append[1] << 16;
  block2[2] = append[1] >> 16;

  block2[0] = block2[0] | append[0] << 24;
  block2[1] = append[0] >> 8 | append[1] << 24;
  block2[2] = append[1] >> 8;

  block2[1] = append[0];
  block2[2] = append[1];

  block2[1] = block2[1] | append[0] << 8;
  block2[2] = append[0] >> 24 | append[1] << 8;
  block2[3] = append[1] >> 24;

  block2[1] = block2[1] | append[0] << 16;
  block2[2] = append[0] >> 16 | append[1] << 16;
  block2[3] = append[1] >> 16;

  block2[1] = block2[1] | append[0] << 24;
  block2[2] = append[0] >> 8 | append[1] << 24;
  block2[3] = append[1] >> 8;

  block2[2] = append[0];
  block2[3] = append[1];

  block2[2] = block2[2] | append[0] << 8;
  block2[3] = append[0] >> 24 | append[1] << 8;
  block3[0] = append[1] >> 24;

  block2[2] = block2[2] | append[0] << 16;
  block2[3] = append[0] >> 16 | append[1] << 16;
  block3[0] = append[1] >> 16;

  block2[2] = block2[2] | append[0] << 24;
  block2[3] = append[0] >> 8 | append[1] << 24;
  block3[0] = append[1] >> 8;

  block2[3] = append[0];
  block3[0] = append[1];

  block2[3] = block2[3] | append[0] << 8;
  block3[0] = append[0] >> 24 | append[1] << 8;
  block3[1] = append[1] >> 24;

  block2[3] = block2[3] | append[0] << 16;
  block3[0] = append[0] >> 16 | append[1] << 16;
  block3[1] = append[1] >> 16;

  block2[3] = block2[3] | append[0] << 24;
  block3[0] = append[0] >> 8 | append[1] << 24;
  block3[1] = append[1] >> 8;

  block3[0] = append[0];
  block3[1] = append[1];

  block3[0] = block3[0] | append[0] << 8;
  block3[1] = append[0] >> 24 | append[1] << 8;
  block3[2] = append[1] >> 24;

  block3[0] = block3[0] | append[0] << 16;
  block3[1] = append[0] >> 16 | append[1] << 16;
  block3[2] = append[1] >> 16;

  block3[0] = block3[0] | append[0] << 24;
  block3[1] = append[0] >> 8 | append[1] << 24;
  block3[2] = append[1] >> 8;

  block3[1] = append[0];
  block3[2] = append[1];

  block3[1] = block3[1] | append[0] << 8;
  block3[2] = append[0] >> 24 | append[1] << 8;
  block3[3] = append[1] >> 24;

  block3[1] = block3[1] | append[0] << 16;
  block3[2] = append[0] >> 16 | append[1] << 16;
  block3[3] = append[1] >> 16;

  block3[1] = block3[1] | append[0] << 24;
  block3[2] = append[0] >> 8 | append[1] << 24;
  block3[3] = append[1] >> 8;

  block3[2] = append[0];
  block3[3] = append[1];
}
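/**
 * swap_workaround byte-swaps all four lanes: __byte_perm with selector
 * 0x0123 picks the bytes of each word in reverse order.
 */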
__device__ static uintm swap_workaround (uintm v)
{
  return __byte_perm (v, 0, 0x0123);
}
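/**
 * scrypt sizing in u32 words: one scrypt block B_i is 2 * r * 16 words
 * (128 * r bytes); p of them form the PBKDF2 output B, N of them form
 * the scratchpad V.
 */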
#define GET_SCRYPT_CNT(r,p) (2 * (r) * 16 * (p))
#define GET_SMIX_CNT(r,N)   (2 * (r) * 16 * (N))
#define GET_STATE_CNT(r)    (2 * (r) * 16)

#define ADD_ROTATE_XOR(r,i1,i2,s) (r) ^= rotate ((i1) + (i2), (s));
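/**
 * SALSA20_2R: one Salsa20 column round plus one row round on the 4x4
 * state held in X0..X3; the uintm swizzles between the half-rounds
 * realign the diagonals instead of transposing the matrix.
 */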
#define SALSA20_2R()                      \
{                                         \
  ADD_ROTATE_XOR (X1, X0, X3,  7);        \
  ADD_ROTATE_XOR (X2, X1, X0,  9);        \
  ADD_ROTATE_XOR (X3, X2, X1, 13);        \
  ADD_ROTATE_XOR (X0, X3, X2, 18);        \
                                          \
  X1 = uintm (X1.w, X1.x, X1.y, X1.z);    \
  X2 = uintm (X2.z, X2.w, X2.x, X2.y);    \
  X3 = uintm (X3.y, X3.z, X3.w, X3.x);    \
                                          \
  ADD_ROTATE_XOR (X3, X0, X1,  7);        \
  ADD_ROTATE_XOR (X2, X3, X0,  9);        \
  ADD_ROTATE_XOR (X1, X2, X3, 13);        \
  ADD_ROTATE_XOR (X0, X1, X2, 18);        \
                                          \
  X1 = uintm (X1.y, X1.z, X1.w, X1.x);    \
  X2 = uintm (X2.z, X2.w, X2.x, X2.y);    \
  X3 = uintm (X3.w, X3.x, X3.y, X3.z);    \
}
#define SALSA20_8_XOR() \
{                       \
  R0 = R0 ^ Y0;         \
  R1 = R1 ^ Y1;         \
  R2 = R2 ^ Y2;         \
  R3 = R3 ^ Y3;         \
                        \
  uintm X0 = R0;        \
  uintm X1 = R1;        \
  uintm X2 = R2;        \
  uintm X3 = R3;        \
                        \
  SALSA20_2R ();        \
  SALSA20_2R ();        \
  SALSA20_2R ();        \
  SALSA20_2R ();        \
                        \
  R0 = R0 + X0;         \
  R1 = R1 + X1;         \
  R2 = R2 + X2;         \
  R3 = R3 + X3;         \
}
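/**
 * salsa_r: scrypt BlockMix over one 128 * r byte block in T[].  Each
 * 64-byte sub-block is XORed into the running state and mixed with
 * Salsa20/8; the trailing loops reorder sub-blocks into the even/odd
 * layout scrypt expects (for r = 1, as used here, they never execute).
 */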
__device__ static void salsa_r (uintm T[8], const u32 r)
{
  const u32 state_cnt = GET_STATE_CNT (r);

  const u32 state_cnt4 = state_cnt / 4;

  uintm R0 = T[state_cnt4 - 4];
  uintm R1 = T[state_cnt4 - 3];
  uintm R2 = T[state_cnt4 - 2];
  uintm R3 = T[state_cnt4 - 1];

  for (u32 i = 0; i < state_cnt4; i += 8)
  {
    uintm Y0 = T[i + 0]; uintm Y1 = T[i + 1]; uintm Y2 = T[i + 2]; uintm Y3 = T[i + 3];

    SALSA20_8_XOR ();

    T[i + 0] = R0; T[i + 1] = R1; T[i + 2] = R2; T[i + 3] = R3;

    Y0 = T[i + 4]; Y1 = T[i + 5]; Y2 = T[i + 6]; Y3 = T[i + 7];

    SALSA20_8_XOR ();

    T[i + 4] = R0; T[i + 5] = R1; T[i + 6] = R2; T[i + 7] = R3;
  }
  #define exchg(x,y) { const uintm t = T[(x)]; T[(x)] = T[(y)]; T[(y)] = t; }

  #define exchg4(x,y)         \
  {                           \
    const u32 x4 = (x) * 4;   \
    const u32 y4 = (y) * 4;   \
                              \
    exchg (x4 + 0, y4 + 0);   \
    exchg (x4 + 1, y4 + 1);   \
    exchg (x4 + 2, y4 + 2);   \
    exchg (x4 + 3, y4 + 3);   \
  }
  for (u32 i = 1; i < r / 1; i++)

  for (u32 i = 1; i < r / 2; i++)

    const u32 xr1 = (r * 2) - 1 - x;
    const u32 yr1 = (r * 2) - 1 - y;
}
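/**
 * scrypt ROMix with a time-memory trade-off: only every tmto-th of the N
 * blocks is kept in V, and skipped positions are recomputed on demand
 * with km extra salsa_r calls during the second pass.
 */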
__device__ static void scrypt_smix (uintm *X, uintm *T, const u32 N, const u32 r, const u32 tmto, const u32 phy, uintm *V)
{
  const u32 state_cnt = GET_STATE_CNT (r);

  const u32 state_cnt4 = state_cnt / 4;

  #if __CUDA_ARCH__ >= 500
  #define Coord(x,y,z) (((y) * zSIZE) + ((x) * zSIZE * ySIZE) + (z))
  #define CO Coord(x,y,z)
  #else
  #define Coord(x,y,z) (((x) * zSIZE) + ((y) * zSIZE * xSIZE) + (z))
  #define CO Coord(x,y,z)
  #endif

  const u32 xSIZE = phy;
  const u32 ySIZE = N / tmto;
  const u32 zSIZE = state_cnt4;

  const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;

  const u32 x = gid % xSIZE;
  for (u32 i = 0; i < state_cnt4; i += 4)
  {
    T[0] = uintm (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
    T[1] = uintm (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
    T[2] = uintm (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
    T[3] = uintm (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);

    X[i + 0] = T[0]; X[i + 1] = T[1]; X[i + 2] = T[2]; X[i + 3] = T[3];
  }
  for (u32 y = 0; y < ySIZE; y++)
  {
    for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];

    for (u32 i = 0; i < tmto; i++) salsa_r (X, r);
  }
  for (u32 i = 0; i < N; i++)
  {
    const u32 k = X[zSIZE - 4].x & (N - 1);

    const u32 y = k / tmto;

    const u32 km = k - (y * tmto);

    for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];

    for (u32 i = 0; i < km; i++) salsa_r (T, r);

    for (u32 z = 0; z < zSIZE; z++) X[z] ^= T[z];

    salsa_r (X, r);
  }
  for (u32 i = 0; i < state_cnt4; i += 4)
  {
    T[0] = uintm (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
    T[1] = uintm (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
    T[2] = uintm (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
    T[3] = uintm (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);

    X[i + 0] = T[0]; X[i + 1] = T[1]; X[i + 2] = T[2]; X[i + 3] = T[3];
  }
}
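/**
 * m08900_init: first PBKDF2-HMAC-SHA256 pass.  Computes
 * B = PBKDF2 (password, salt, 1, p * 128 * r) and stores it in
 * tmps[gid].P[] for the loop kernel.
 */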
extern "C" __global__ void __launch_bounds__ (64, 1) m08900_init (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, scrypt_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, uintm *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;

  if (gid >= gid_max) return;

  u32x w0[4];

  w0[0] = pws[gid].i[ 0];
  w0[1] = pws[gid].i[ 1];
  w0[2] = pws[gid].i[ 2];
  w0[3] = pws[gid].i[ 3];

  u32x w1[4];

  w1[0] = pws[gid].i[ 4];
  w1[1] = pws[gid].i[ 5];
  w1[2] = pws[gid].i[ 6];
  w1[3] = pws[gid].i[ 7];

  u32x w2[4];

  w2[0] = pws[gid].i[ 8];
  w2[1] = pws[gid].i[ 9];
  w2[2] = pws[gid].i[10];
  w2[3] = pws[gid].i[11];

  u32x w3[4];

  w3[0] = pws[gid].i[12];
  w3[1] = pws[gid].i[13];
  w3[2] = pws[gid].i[14];
  w3[3] = pws[gid].i[15];
  u32 salt_buf0[4];

  salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
  salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
  salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
  salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];

  u32 salt_buf1[4];

  salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
  salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
  salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
  salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7];

  const u32 salt_len = salt_bufs[salt_pos].salt_len;
  const u32 scrypt_r = SCRYPT_R;
  const u32 scrypt_p = SCRYPT_P;
  //const u32 scrypt_N = SCRYPT_N;

  //const u32 state_cnt = GET_STATE_CNT (scrypt_r);
  const u32 scrypt_cnt = GET_SCRYPT_CNT (scrypt_r, scrypt_p);
  //const u32 smix_cnt = GET_SMIX_CNT (scrypt_r, scrypt_N);
  /**
   * 1st pbkdf2, creates B
   */
  w0[0] = swap_workaround (w0[0]);
  w0[1] = swap_workaround (w0[1]);
  w0[2] = swap_workaround (w0[2]);
  w0[3] = swap_workaround (w0[3]);
  w1[0] = swap_workaround (w1[0]);
  w1[1] = swap_workaround (w1[1]);
  w1[2] = swap_workaround (w1[2]);
  w1[3] = swap_workaround (w1[3]);
  w2[0] = swap_workaround (w2[0]);
  w2[1] = swap_workaround (w2[1]);
  w2[2] = swap_workaround (w2[2]);
  w2[3] = swap_workaround (w2[3]);
  w3[0] = swap_workaround (w3[0]);
  w3[1] = swap_workaround (w3[1]);
  w3[2] = swap_workaround (w3[2]);
  w3[3] = swap_workaround (w3[3]);
  u32x ipad[8];
  u32x opad[8];

  hmac_sha256_pad (w0, w1, w2, w3, ipad, opad);
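  /**
   * With an iteration count of 1, each PBKDF2 output block is a single
   * HMAC call: T_j = HMAC (password, salt || INT_BE (j + 1)).  Every
   * iteration yields eight u32, stored as two uintm entries of P[].
   */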
  for (u32 i = 0, j = 0, k = 0; i < scrypt_cnt; i += 8, j += 1, k += 2)
  {
    w0[0] = salt_buf0[0];
    w0[1] = salt_buf0[1];
    w0[2] = salt_buf0[2];
    w0[3] = salt_buf0[3];
    w1[0] = salt_buf1[0];
    w1[1] = salt_buf1[1];
    w1[2] = salt_buf1[2];
    w1[3] = salt_buf1[3];

    u32 append[2];

    append[0] = swap_workaround (j + 1);

    memcat8 (w0, w1, w2, w3, salt_len, append);
    w0[0] = swap_workaround (w0[0]);
    w0[1] = swap_workaround (w0[1]);
    w0[2] = swap_workaround (w0[2]);
    w0[3] = swap_workaround (w0[3]);
    w1[0] = swap_workaround (w1[0]);
    w1[1] = swap_workaround (w1[1]);
    w1[2] = swap_workaround (w1[2]);
    w1[3] = swap_workaround (w1[3]);
    w2[0] = swap_workaround (w2[0]);
    w2[1] = swap_workaround (w2[1]);
    w2[2] = swap_workaround (w2[2]);
    w2[3] = swap_workaround (w2[3]);
    w3[0] = swap_workaround (w3[0]);
    w3[1] = swap_workaround (w3[1]);

    w3[3] = (64 + salt_len + 4) * 8;
    u32x digest[8];

    hmac_sha256_run (w0, w1, w2, w3, ipad, opad, digest);

    const uintm tmp0 = uintm (digest[0], digest[1], digest[2], digest[3]);
    const uintm tmp1 = uintm (digest[4], digest[5], digest[6], digest[7]);

    tmps[gid].P[k + 0] = tmp0;
    tmps[gid].P[k + 1] = tmp1;
  }
}
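/**
 * m08900_loop: runs scrypt_smix over each of the p scrypt blocks in
 * tmps[gid].P[].  P is kept in SHA-256 (big-endian) word order, so each
 * block is byte-swapped in, mixed, and swapped back out.
 */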
extern "C" __global__ void __launch_bounds__ (64, 1) m08900_loop (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, scrypt_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, uintm *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;

  if (gid >= gid_max) return;

  const u32 scrypt_phy = salt_bufs[salt_pos].scrypt_phy;

  const u32 state_cnt  = GET_STATE_CNT  (SCRYPT_R);
  const u32 scrypt_cnt = GET_SCRYPT_CNT (SCRYPT_R, SCRYPT_P);

  const u32 state_cnt4  = state_cnt  / 4;
  const u32 scrypt_cnt4 = scrypt_cnt / 4;

  uintm X[state_cnt4];
  uintm T[state_cnt4];

  for (int z = 0; z < state_cnt4; z++) X[z] = swap_workaround (tmps[gid].P[z]);

  scrypt_smix (X, T, SCRYPT_N, SCRYPT_R, SCRYPT_TMTO, scrypt_phy, d_scryptV_buf);

  for (int z = 0; z < state_cnt4; z++) tmps[gid].P[z] = swap_workaround (X[z]);

  for (int i = state_cnt4; i < scrypt_cnt4; i += state_cnt4)
  {
    for (int z = 0; z < state_cnt4; z++) X[z] = swap_workaround (tmps[gid].P[i + z]);

    scrypt_smix (X, T, SCRYPT_N, SCRYPT_R, SCRYPT_TMTO, scrypt_phy, d_scryptV_buf);

    for (int z = 0; z < state_cnt4; z++) tmps[gid].P[i + z] = swap_workaround (X[z]);
  }
}
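/**
 * m08900_comp: second PBKDF2-HMAC-SHA256 pass with the mixed B as salt,
 * out = PBKDF2 (password, B, 1, 32); the four result words go to the
 * generic comparator included via VECT_COMPARE_M.
 */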
extern "C" __global__ void __launch_bounds__ (64, 1) m08900_comp (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, scrypt_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const void *esalt_bufs, u32 *d_return_buf, uintm *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
  const u32 lid = threadIdx.x;

  if (gid >= gid_max) return;
  u32x w0[4];

  w0[0] = pws[gid].i[ 0];
  w0[1] = pws[gid].i[ 1];
  w0[2] = pws[gid].i[ 2];
  w0[3] = pws[gid].i[ 3];

  u32x w1[4];

  w1[0] = pws[gid].i[ 4];
  w1[1] = pws[gid].i[ 5];
  w1[2] = pws[gid].i[ 6];
  w1[3] = pws[gid].i[ 7];

  u32x w2[4];

  w2[0] = pws[gid].i[ 8];
  w2[1] = pws[gid].i[ 9];
  w2[2] = pws[gid].i[10];
  w2[3] = pws[gid].i[11];

  u32x w3[4];

  w3[0] = pws[gid].i[12];
  w3[1] = pws[gid].i[13];
  w3[2] = pws[gid].i[14];
  w3[3] = pws[gid].i[15];
  const u32 scrypt_r = SCRYPT_R;
  const u32 scrypt_p = SCRYPT_P;

  const u32 scrypt_cnt = GET_SCRYPT_CNT (scrypt_r, scrypt_p);

  const u32 scrypt_cnt4 = scrypt_cnt / 4;
  /**
   * 2nd pbkdf2, uses the mixed B as salt
   */
  w0[0] = swap_workaround (w0[0]);
  w0[1] = swap_workaround (w0[1]);
  w0[2] = swap_workaround (w0[2]);
  w0[3] = swap_workaround (w0[3]);
  w1[0] = swap_workaround (w1[0]);
  w1[1] = swap_workaround (w1[1]);
  w1[2] = swap_workaround (w1[2]);
  w1[3] = swap_workaround (w1[3]);
  w2[0] = swap_workaround (w2[0]);
  w2[1] = swap_workaround (w2[1]);
  w2[2] = swap_workaround (w2[2]);
  w2[3] = swap_workaround (w2[3]);
  w3[0] = swap_workaround (w3[0]);
  w3[1] = swap_workaround (w3[1]);
  w3[2] = swap_workaround (w3[2]);
  w3[3] = swap_workaround (w3[3]);

  u32x ipad[8];
  u32x opad[8];

  hmac_sha256_pad (w0, w1, w2, w3, ipad, opad);
  for (u32 l = 0; l < scrypt_cnt4; l += 4)
  {
    uintm tmp;

    tmp = tmps[gid].P[l + 0]; w0[0] = tmp.x; w0[1] = tmp.y; w0[2] = tmp.z; w0[3] = tmp.w;
    tmp = tmps[gid].P[l + 1]; w1[0] = tmp.x; w1[1] = tmp.y; w1[2] = tmp.z; w1[3] = tmp.w;
    tmp = tmps[gid].P[l + 2]; w2[0] = tmp.x; w2[1] = tmp.y; w2[2] = tmp.z; w2[3] = tmp.w;
    tmp = tmps[gid].P[l + 3]; w3[0] = tmp.x; w3[1] = tmp.y; w3[2] = tmp.z; w3[3] = tmp.w;

    sha256_transform (w0, w1, w2, w3, ipad);
  }
  w3[3] = (64 + (scrypt_cnt * 4) + 4) * 8;

  u32x digest[8];

  hmac_sha256_run (w0, w1, w2, w3, ipad, opad, digest);

  const u32x r0 = swap_workaround (digest[DGST_R0]);
  const u32x r1 = swap_workaround (digest[DGST_R1]);
  const u32x r2 = swap_workaround (digest[DGST_R2]);
  const u32x r3 = swap_workaround (digest[DGST_R3]);

  #include VECT_COMPARE_M
}