nv/m12000.cu

   1 /**
   2  * Author......: Jens Steube <jens.steube@gmail.com>
   3  * License.....: MIT
   4  */
   5
   6 #define _PBKDF2_SHA1_
   7
   8 #include "include/constants.h"
   9 #include "include/kernel_vendor.h"
  10
  11 #ifdef  VLIW1
  12 #define VECT_SIZE1
  13 #endif
  14
  15 #ifdef  VLIW2
  16 #define VECT_SIZE1
  17 #endif
  18
  19 #define DGST_R0 0
  20 #define DGST_R1 1
  21 #define DGST_R2 2
  22 #define DGST_R3 3
  23
  24 #include "include/kernel_functions.c"
  25
  26 #include "types_nv.c"
  27 #include "common_nv.c"
  28
  29 #ifdef  VECT_SIZE1
  30 #define VECT_COMPARE_M "check_multi_vect1_comp4.c"
  31 #endif
  32
  33 __device__ static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
  34 {
  35   u32x A = digest[0];
  36   u32x B = digest[1];
  37   u32x C = digest[2];
  38   u32x D = digest[3];
  39   u32x E = digest[4];
  40
  41   u32x w0_t = w0[0];
  42   u32x w1_t = w0[1];
  43   u32x w2_t = w0[2];
  44   u32x w3_t = w0[3];
  45   u32x w4_t = w1[0];
  46   u32x w5_t = w1[1];
  47   u32x w6_t = w1[2];
  48   u32x w7_t = w1[3];
  49   u32x w8_t = w2[0];
  50   u32x w9_t = w2[1];
  51   u32x wa_t = w2[2];
  52   u32x wb_t = w2[3];
  53   u32x wc_t = w3[0];
  54   u32x wd_t = w3[1];
  55   u32x we_t = w3[2];
  56   u32x wf_t = w3[3];
  57
  58   #undef K
  59   #define K SHA1C00
  60
  61   SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t);
  62   SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t);
  63   SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t);
  64   SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t);
  65   SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t);
  66   SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t);
  67   SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t);
  68   SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t);
  69   SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t);
  70   SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t);
  71   SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t);
  72   SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t);
  73   SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t);
  74   SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t);
  75   SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t);
  76   SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t);
  77   w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t);
  78   w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t);
  79   w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t);
  80   w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t);
  81
  82   #undef K
  83   #define K SHA1C01
  84
  85   w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t);
  86   w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t);
  87   w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t);
  88   w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t);
  89   w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t);
  90   w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t);
  91   wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t);
  92   wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t);
  93   wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t);
  94   wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t);
  95   we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t);
  96   wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t);
  97   w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t);
  98   w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t);
  99   w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t);
 100   w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t);
 101   w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t);
 102   w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t);
 103   w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t);
 104   w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t);
 105
 106   #undef K
 107   #define K SHA1C02
 108
 109   w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t);
 110   w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t);
 111   wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t);
 112   wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t);
 113   wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t);
 114   wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t);
 115   we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t);
 116   wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t);
 117   w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t);
 118   w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t);
 119   w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t);
 120   w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t);
 121   w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t);
 122   w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t);
 123   w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t);
 124   w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t);
 125   w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t);
 126   w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t);
 127   wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t);
 128   wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t);
 129
 130   #undef K
 131   #define K SHA1C03
 132
 133   wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t);
 134   wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t);
 135   we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t);
 136   wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t);
 137   w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t);
 138   w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t);
 139   w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t);
 140   w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t);
 141   w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t);
 142   w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t);
 143   w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t);
 144   w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t);
 145   w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t);
 146   w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t);
 147   wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t);
 148   wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t);
 149   wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t);
 150   wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t);
 151   we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t);
 152   wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t);
 153
 154   digest[0] += A;
 155   digest[1] += B;
 156   digest[2] += C;
 157   digest[3] += D;
 158   digest[4] += E;
 159 }
 160
 161 __device__ static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5])
 162 {
 163   w0[0] = w0[0] ^ 0x36363636;
 164   w0[1] = w0[1] ^ 0x36363636;
 165   w0[2] = w0[2] ^ 0x36363636;
 166   w0[3] = w0[3] ^ 0x36363636;
 167   w1[0] = w1[0] ^ 0x36363636;
 168   w1[1] = w1[1] ^ 0x36363636;
 169   w1[2] = w1[2] ^ 0x36363636;
 170   w1[3] = w1[3] ^ 0x36363636;
 171   w2[0] = w2[0] ^ 0x36363636;
 172   w2[1] = w2[1] ^ 0x36363636;
 173   w2[2] = w2[2] ^ 0x36363636;
 174   w2[3] = w2[3] ^ 0x36363636;
 175   w3[0] = w3[0] ^ 0x36363636;
 176   w3[1] = w3[1] ^ 0x36363636;
 177   w3[2] = w3[2] ^ 0x36363636;
 178   w3[3] = w3[3] ^ 0x36363636;
 179
 180   ipad[0] = SHA1M_A;
 181   ipad[1] = SHA1M_B;
 182   ipad[2] = SHA1M_C;
 183   ipad[3] = SHA1M_D;
 184   ipad[4] = SHA1M_E;
 185
 186   sha1_transform (w0, w1, w2, w3, ipad);
 187
 188   w0[0] = w0[0] ^ 0x6a6a6a6a;
 189   w0[1] = w0[1] ^ 0x6a6a6a6a;
 190   w0[2] = w0[2] ^ 0x6a6a6a6a;
 191   w0[3] = w0[3] ^ 0x6a6a6a6a;
 192   w1[0] = w1[0] ^ 0x6a6a6a6a;
 193   w1[1] = w1[1] ^ 0x6a6a6a6a;
 194   w1[2] = w1[2] ^ 0x6a6a6a6a;
 195   w1[3] = w1[3] ^ 0x6a6a6a6a;
 196   w2[0] = w2[0] ^ 0x6a6a6a6a;
 197   w2[1] = w2[1] ^ 0x6a6a6a6a;
 198   w2[2] = w2[2] ^ 0x6a6a6a6a;
 199   w2[3] = w2[3] ^ 0x6a6a6a6a;
 200   w3[0] = w3[0] ^ 0x6a6a6a6a;
 201   w3[1] = w3[1] ^ 0x6a6a6a6a;
 202   w3[2] = w3[2] ^ 0x6a6a6a6a;
 203   w3[3] = w3[3] ^ 0x6a6a6a6a;
 204
 205   opad[0] = SHA1M_A;
 206   opad[1] = SHA1M_B;
 207   opad[2] = SHA1M_C;
 208   opad[3] = SHA1M_D;
 209   opad[4] = SHA1M_E;
 210
 211   sha1_transform (w0, w1, w2, w3, opad);
 212 }
 213
 214 __device__ static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5])
 215 {
 216   digest[0] = ipad[0];
 217   digest[1] = ipad[1];
 218   digest[2] = ipad[2];
 219   digest[3] = ipad[3];
 220   digest[4] = ipad[4];
 221
 222   sha1_transform (w0, w1, w2, w3, digest);
 223
 224   w0[0] = digest[0];
 225   w0[1] = digest[1];
 226   w0[2] = digest[2];
 227   w0[3] = digest[3];
 228   w1[0] = digest[4];
 229   w1[1] = 0x80000000;
 230   w1[2] = 0;
 231   w1[3] = 0;
 232   w2[0] = 0;
 233   w2[1] = 0;
 234   w2[2] = 0;
 235   w2[3] = 0;
 236   w3[0] = 0;
 237   w3[1] = 0;
 238   w3[2] = 0;
 239   w3[3] = (64 + 20) * 8;
 240
 241   digest[0] = opad[0];
 242   digest[1] = opad[1];
 243   digest[2] = opad[2];
 244   digest[3] = opad[3];
 245   digest[4] = opad[4];
 246
 247   sha1_transform (w0, w1, w2, w3, digest);
 248 }
 249
 250 extern "C" __global__ void __launch_bounds__ (256, 1) m12000_init (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, pbkdf2_sha1_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const pbkdf2_sha1_t *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 251 {
 252   /**
 253    * base
 254    */
 255
 256   const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
 257
 258   if (gid >= gid_max) return;
 259
 260   u32x w0[4];
 261
 262   w0[0] = swap_workaround (pws[gid].i[ 0]);
 263   w0[1] = swap_workaround (pws[gid].i[ 1]);
 264   w0[2] = swap_workaround (pws[gid].i[ 2]);
 265   w0[3] = swap_workaround (pws[gid].i[ 3]);
 266
 267   u32x w1[4];
 268
 269   w1[0] = swap_workaround (pws[gid].i[ 4]);
 270   w1[1] = swap_workaround (pws[gid].i[ 5]);
 271   w1[2] = swap_workaround (pws[gid].i[ 6]);
 272   w1[3] = swap_workaround (pws[gid].i[ 7]);
 273
 274   u32x w2[4];
 275
 276   w2[0] = swap_workaround (pws[gid].i[ 8]);
 277   w2[1] = swap_workaround (pws[gid].i[ 9]);
 278   w2[2] = swap_workaround (pws[gid].i[10]);
 279   w2[3] = swap_workaround (pws[gid].i[11]);
 280
 281   u32x w3[4];
 282
 283   w3[0] = swap_workaround (pws[gid].i[12]);
 284   w3[1] = swap_workaround (pws[gid].i[13]);
 285   w3[2] = swap_workaround (pws[gid].i[14]);
 286   w3[3] = swap_workaround (pws[gid].i[15]);
 287
 288   /**
 289    * salt
 290    */
 291
 292   const u32 salt_len = salt_bufs[salt_pos].salt_len;
 293
 294   u32 esalt_buf0[4];
 295   u32 esalt_buf1[4];
 296   u32 esalt_buf2[4];
 297   u32 esalt_buf3[4];
 298
 299   esalt_buf0[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0]);
 300   esalt_buf0[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]);
 301   esalt_buf0[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2]);
 302   esalt_buf0[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]);
 303   esalt_buf1[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4]);
 304   esalt_buf1[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]);
 305   esalt_buf1[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6]);
 306   esalt_buf1[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]);
 307   esalt_buf2[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8]);
 308   esalt_buf2[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]);
 309   esalt_buf2[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[10]);
 310   esalt_buf2[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[11]);
 311   esalt_buf3[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[12]);
 312   esalt_buf3[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[13]);
 313   esalt_buf3[2] = 0;
 314   esalt_buf3[3] = (64 + salt_len + 4) * 8;
 315
 316   u32 ipad[5];
 317   u32 opad[5];
 318
 319   hmac_sha1_pad (w0, w1, w2, w3, ipad, opad);
 320
 321   tmps[gid].ipad[0] = ipad[0];
 322   tmps[gid].ipad[1] = ipad[1];
 323   tmps[gid].ipad[2] = ipad[2];
 324   tmps[gid].ipad[3] = ipad[3];
 325   tmps[gid].ipad[4] = ipad[4];
 326
 327   tmps[gid].opad[0] = opad[0];
 328   tmps[gid].opad[1] = opad[1];
 329   tmps[gid].opad[2] = opad[2];
 330   tmps[gid].opad[3] = opad[3];
 331   tmps[gid].opad[4] = opad[4];
 332
 333   for (u32 i = 0, j = 1; i < 5; i += 5, j += 1)
 334   {
 335     u32 dgst[5];
 336
 337     hmac_sha1_run (esalt_buf0, esalt_buf1, esalt_buf2, esalt_buf3, ipad, opad, dgst);
 338
 339     tmps[gid].dgst[i + 0] = dgst[0];
 340     tmps[gid].dgst[i + 1] = dgst[1];
 341     tmps[gid].dgst[i + 2] = dgst[2];
 342     tmps[gid].dgst[i + 3] = dgst[3];
 343     tmps[gid].dgst[i + 4] = dgst[4];
 344
 345     tmps[gid].out[i + 0] = dgst[0];
 346     tmps[gid].out[i + 1] = dgst[1];
 347     tmps[gid].out[i + 2] = dgst[2];
 348     tmps[gid].out[i + 3] = dgst[3];
 349     tmps[gid].out[i + 4] = dgst[4];
 350   }
 351 }
 352
 353 extern "C" __global__ void __launch_bounds__ (256, 1) m12000_loop (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, pbkdf2_sha1_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const pbkdf2_sha1_t *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 354 {
 355   const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
 356
 357   if (gid >= gid_max) return;
 358
 359   u32 ipad[5];
 360
 361   ipad[0] = tmps[gid].ipad[0];
 362   ipad[1] = tmps[gid].ipad[1];
 363   ipad[2] = tmps[gid].ipad[2];
 364   ipad[3] = tmps[gid].ipad[3];
 365   ipad[4] = tmps[gid].ipad[4];
 366
 367   u32 opad[5];
 368
 369   opad[0] = tmps[gid].opad[0];
 370   opad[1] = tmps[gid].opad[1];
 371   opad[2] = tmps[gid].opad[2];
 372   opad[3] = tmps[gid].opad[3];
 373   opad[4] = tmps[gid].opad[4];
 374
 375   for (u32 i = 0; i < 5; i += 5)
 376   {
 377     u32 dgst[5];
 378
 379     dgst[0] = tmps[gid].dgst[i + 0];
 380     dgst[1] = tmps[gid].dgst[i + 1];
 381     dgst[2] = tmps[gid].dgst[i + 2];
 382     dgst[3] = tmps[gid].dgst[i + 3];
 383     dgst[4] = tmps[gid].dgst[i + 4];
 384
 385     u32 out[5];
 386
 387     out[0] = tmps[gid].out[i + 0];
 388     out[1] = tmps[gid].out[i + 1];
 389     out[2] = tmps[gid].out[i + 2];
 390     out[3] = tmps[gid].out[i + 3];
 391     out[4] = tmps[gid].out[i + 4];
 392
 393     for (u32 j = 0; j < loop_cnt; j++)
 394     {
 395       u32 w0[4];
 396       u32 w1[4];
 397       u32 w2[4];
 398       u32 w3[4];
 399
 400       w0[0] = dgst[0];
 401       w0[1] = dgst[1];
 402       w0[2] = dgst[2];
 403       w0[3] = dgst[3];
 404       w1[0] = dgst[4];
 405       w1[1] = 0x80000000;
 406       w1[2] = 0;
 407       w1[3] = 0;
 408       w2[0] = 0;
 409       w2[1] = 0;
 410       w2[2] = 0;
 411       w2[3] = 0;
 412       w3[0] = 0;
 413       w3[1] = 0;
 414       w3[2] = 0;
 415       w3[3] = (64 + 20) * 8;
 416
 417       hmac_sha1_run (w0, w1, w2, w3, ipad, opad, dgst);
 418
 419       out[0] ^= dgst[0];
 420       out[1] ^= dgst[1];
 421       out[2] ^= dgst[2];
 422       out[3] ^= dgst[3];
 423       out[4] ^= dgst[4];
 424     }
 425
 426     tmps[gid].dgst[i + 0] = dgst[0];
 427     tmps[gid].dgst[i + 1] = dgst[1];
 428     tmps[gid].dgst[i + 2] = dgst[2];
 429     tmps[gid].dgst[i + 3] = dgst[3];
 430     tmps[gid].dgst[i + 4] = dgst[4];
 431
 432     tmps[gid].out[i + 0] = out[0];
 433     tmps[gid].out[i + 1] = out[1];
 434     tmps[gid].out[i + 2] = out[2];
 435     tmps[gid].out[i + 3] = out[3];
 436     tmps[gid].out[i + 4] = out[4];
 437   }
 438 }
 439
 440 extern "C" __global__ void __launch_bounds__ (256, 1) m12000_comp (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, pbkdf2_sha1_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const pbkdf2_sha1_t *esalt_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 441 {
 442   /**
 443    * base
 444    */
 445
 446   const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
 447
 448   if (gid >= gid_max) return;
 449
 450   const u32 lid = threadIdx.x;
 451
 452   const u32x r0 = tmps[gid].out[DGST_R0];
 453   const u32x r1 = tmps[gid].out[DGST_R1];
 454   const u32x r2 = tmps[gid].out[DGST_R2];
 455   const u32x r3 = tmps[gid].out[DGST_R3];
 456
 457   #define il_pos 0
 458
 459   #include VECT_COMPARE_M
 460 }