amd/m10100_a1.cl

   1 /**
   2  * Author......: Jens Steube <jens.steube@gmail.com>
   3  * License.....: MIT
   4  */
   5
   /* Defined ahead of the project includes. Presumably it makes
      include/constants.h provide the SIPHASHM_0..3 constants that the
      kernels in this file use - TODO confirm against that header. */
   6 #define _SIPHASH_
   7
   8 #include "include/constants.h"
   9 #include "include/kernel_vendor.h"
  10
  /*
   * Vector width selection: each of VLIW1, VLIW4 and VLIW5 maps to
   * VECT_SIZE1. This file itself never defines VECT_SIZE2 or VECT_SIZE4.
   */
  11 #ifdef  VLIW1
  12 #define VECT_SIZE1
  13 #endif
  14
  15 #ifdef  VLIW4
  16 #define VECT_SIZE1
  17 #endif
  18
  19 #ifdef  VLIW5
  20 #define VECT_SIZE1
  21 #endif
  22
  /*
   * Digest word indices. m10100_s04 uses them to index digest_buf when it
   * loads its search[] words. They are defined before the project includes,
   * which may read them as well - not visible from this file.
   */
  23 #define DGST_R0 0
  24 #define DGST_R1 1
  25 #define DGST_R2 2
  26 #define DGST_R3 3
  27
  28 #include "include/kernel_functions.c"
  29 #include "types_amd.c"
  30 #include "common_amd.c"
  31
  /*
   * File names of the comparison snippets that the kernels textually
   * #include at the end of their candidate loop: VECT_COMPARE_M is included
   * by m10100_m04 and VECT_COMPARE_S by m10100_s04. One pair of names exists
   * per vector width; if none of VECT_SIZE1/2/4 is defined, neither macro is
   * defined here.
   */
  32 #ifdef  VECT_SIZE1
  33 #define VECT_COMPARE_S "check_single_vect1_comp4.c"
  34 #define VECT_COMPARE_M "check_multi_vect1_comp4.c"
  35 #endif
  36
  37 #ifdef  VECT_SIZE2
  38 #define VECT_COMPARE_S "check_single_vect2_comp4.c"
  39 #define VECT_COMPARE_M "check_multi_vect2_comp4.c"
  40 #endif
  41
  42 #ifdef  VECT_SIZE4
  43 #define VECT_COMPARE_S "check_single_vect4_comp4.c"
  44 #define VECT_COMPARE_M "check_multi_vect4_comp4.c"
  45 #endif
  46
  /**
   * SIPROUND_ROTL32 (x)
   *
   * Rotate a 64-bit value left by 32 bits. With VECT_SIZE1 the value is
   * reinterpreted as a uint2, its two 32-bit components are swapped and the
   * result is reinterpreted as a ulong; otherwise the generic rotl64 ()
   * helper is used. Both spellings sit at the same two positions of the
   * round, exactly as in the two former copies of SIPROUND.
   */
  #ifdef VECT_SIZE1
  #define SIPROUND_ROTL32(x) as_ulong (as_uint2 (x).s10)
  #else
  #define SIPROUND_ROTL32(x) rotl64 ((x), 32)
  #endif

  /**
   * SIPROUND (v0, v1, v2, v3)
   *
   * One SipHash round applied in place to the four 64-bit state words.
   * All four arguments must be modifiable lvalues; each is evaluated
   * several times, so do not pass expressions with side effects.
   *
   * The body is wrapped in do { ... } while (0) so that the macro expands to
   * exactly one statement. Without the wrapper, an unbraced
   * "if (c) SIPROUND (...); else ..." would guard only the first assignment
   * and fail to compile at the else. Existing call sites of the form
   * "SIPROUND (v0, v1, v2, v3);" are unaffected.
   */
  #define SIPROUND(v0,v1,v2,v3)           \
    do                                    \
    {                                     \
      (v0) += (v1);                       \
      (v1)  = rotl64 ((v1), 13);          \
      (v1) ^= (v0);                       \
      (v0)  = SIPROUND_ROTL32 ((v0));     \
      (v2) += (v3);                       \
      (v3)  = rotl64 ((v3), 16);          \
      (v3) ^= (v2);                       \
      (v0) += (v3);                       \
      (v3)  = rotl64 ((v3), 21);          \
      (v3) ^= (v0);                       \
      (v2) += (v1);                       \
      (v1)  = rotl64 ((v1), 17);          \
      (v1) ^= (v2);                       \
      (v2)  = SIPROUND_ROTL32 ((v2));     \
    } while (0)
  80
  /**
   * m10100_m04: combinator kernel, multi-hash comparison variant.
   *
   * Each work-item takes one left-hand word (pws[gid]) and, for every
   * right-hand word in combs_buf[0 .. combs_cnt-1], merges the two into a
   * 16-word buffer, runs a SipHash-style computation over it (2 rounds per
   * 64-bit message word, 4 finalization rounds) and hands the 64-bit result
   * to the comparison code included via VECT_COMPARE_M.
   *
   * Parameters read by the visible body:
   *   pws         left-hand words; .i[0..7] and .pw_len are read
   *   combs_buf   right-hand words; .i[0..7] and .pw_len are read
   *   salt_bufs   salt_buf[0..3] of entry salt_pos is XORed into the
   *               initial state
   *   salt_pos    index into salt_bufs
   *   combs_cnt   number of right-hand words to iterate over
   *   combs_mode  compared against COMBINATOR_MODE_BASE_LEFT / _RIGHT to
   *               decide which side's buffer is passed to
   *               switch_buffer_by_offset
   *   gid_max     work-items with gid >= gid_max return immediately
   *
   * The remaining parameters are not referenced in the visible body; the
   * textually included VECT_COMPARE_M file (not visible here) presumably
   * uses some of them together with locals such as lid, gid, il_pos and
   * r0..r3 - confirm against that file before renaming anything.
   */
  81 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_m04 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
  82 {
  83   /**
  84    * modifier
  85    */
  86
       /* lid is not referenced in the visible body; presumably the included
          comparison code uses it - confirm */
  87   const u32 lid = get_local_id (0);
  88
  89   /**
  90    * base
  91    */
  92
  93   const u32 gid = get_global_id (0);
  94
       /* work-items at or beyond gid_max have nothing to process */
  95   if (gid >= gid_max) return;
  96
       /* left-hand word: the first eight 32-bit words come from pws[gid],
          the upper eight start out as zero */
  97   u32x wordl0[4];
  98
  99   wordl0[0] = pws[gid].i[ 0];
 100   wordl0[1] = pws[gid].i[ 1];
 101   wordl0[2] = pws[gid].i[ 2];
 102   wordl0[3] = pws[gid].i[ 3];
 103
 104   u32x wordl1[4];
 105
 106   wordl1[0] = pws[gid].i[ 4];
 107   wordl1[1] = pws[gid].i[ 5];
 108   wordl1[2] = pws[gid].i[ 6];
 109   wordl1[3] = pws[gid].i[ 7];
 110
 111   u32x wordl2[4];
 112
 113   wordl2[0] = 0;
 114   wordl2[1] = 0;
 115   wordl2[2] = 0;
 116   wordl2[3] = 0;
 117
 118   u32x wordl3[4];
 119
 120   wordl3[0] = 0;
 121   wordl3[1] = 0;
 122   wordl3[2] = 0;
 123   wordl3[3] = 0;
 124
 125   const u32 pw_l_len = pws[gid].pw_len;
 126
       /* BASE_RIGHT mode: the left-hand buffer is passed to
          switch_buffer_by_offset once, outside the loop, with the length of
          combs_buf[0]. The helper is defined elsewhere; presumably it moves
          the buffer contents up by that many bytes so both sides can be
          OR-merged below - confirm.
          NOTE(review): only entry 0's length is used here, while the loop
          reads a per-entry pw_len; presumably all right-hand words of a
          launch share one length in this mode - confirm. */
 127   if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
 128   {
 129     switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
 130   }
 131
 132   /**
 133    * base
 134    */
 135
       /* initial state shared by all candidates of this work-item: the four
          SIPHASHM_* constants, with the 64-bit value built from salt_buf[1]
          and salt_buf[0] XORed into v0p and v2p and the one built from
          salt_buf[3] and salt_buf[2] XORed into v1p and v3p */
 136   u64 v0p = SIPHASHM_0;
 137   u64 v1p = SIPHASHM_1;
 138   u64 v2p = SIPHASHM_2;
 139   u64 v3p = SIPHASHM_3;
 140
 141   v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
 142   v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
 143   v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
 144   v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
 145
 146   /**
 147    * loop
 148    */
 149
       /* one candidate per right-hand word */
 150   for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++)
 151   {
 152     const u32 pw_r_len = combs_buf[il_pos].pw_len;
 153
 154     const u32 pw_len = pw_l_len + pw_r_len;
 155
         /* right-hand word, laid out like the left-hand one */
 156     u32 wordr0[4];
 157
 158     wordr0[0] = combs_buf[il_pos].i[0];
 159     wordr0[1] = combs_buf[il_pos].i[1];
 160     wordr0[2] = combs_buf[il_pos].i[2];
 161     wordr0[3] = combs_buf[il_pos].i[3];
 162
 163     u32 wordr1[4];
 164
 165     wordr1[0] = combs_buf[il_pos].i[4];
 166     wordr1[1] = combs_buf[il_pos].i[5];
 167     wordr1[2] = combs_buf[il_pos].i[6];
 168     wordr1[3] = combs_buf[il_pos].i[7];
 169
 170     u32 wordr2[4];
 171
 172     wordr2[0] = 0;
 173     wordr2[1] = 0;
 174     wordr2[2] = 0;
 175     wordr2[3] = 0;
 176
 177     u32 wordr3[4];
 178
 179     wordr3[0] = 0;
 180     wordr3[1] = 0;
 181     wordr3[2] = 0;
 182     wordr3[3] = 0;
 183
         /* BASE_LEFT mode: the right-hand buffer is the one passed to
            switch_buffer_by_offset, with the left-hand word's length */
 184     if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
 185     {
 186       switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
 187     }
 188
         /* merge both sides word by word with bitwise OR */
 189     u32x w[16];
 190
 191     w[ 0] = wordl0[0] | wordr0[0];
 192     w[ 1] = wordl0[1] | wordr0[1];
 193     w[ 2] = wordl0[2] | wordr0[2];
 194     w[ 3] = wordl0[3] | wordr0[3];
 195     w[ 4] = wordl1[0] | wordr1[0];
 196     w[ 5] = wordl1[1] | wordr1[1];
 197     w[ 6] = wordl1[2] | wordr1[2];
 198     w[ 7] = wordl1[3] | wordr1[3];
 199     w[ 8] = wordl2[0] | wordr2[0];
 200     w[ 9] = wordl2[1] | wordr2[1];
 201     w[10] = wordl2[2] | wordr2[2];
 202     w[11] = wordl2[3] | wordr2[3];
 203     w[12] = wordl3[0] | wordr3[0];
 204     w[13] = wordl3[1] | wordr3[1];
 205     w[14] = wordl3[2] | wordr3[2];
 206     w[15] = wordl3[3] | wordr3[3];
 207
         /* view w[] as 64-bit words and OR the combined length into the top
            byte of the word at index pw_len / 8.
            NOTE(review): the cast presumes u32x is a plain 32-bit scalar
            (the VECT_SIZE1 case) - confirm. No bound on pw_len / 8 is
            checked here; w[] holds eight 64-bit words, so presumably the
            host keeps the combined length below 64 - confirm. */
 208     u64 *w_ptr = (u64 *) w;
 209
 210     w_ptr[pw_len / 8] |= (u64) pw_len << 56;
 211
         /* every candidate restarts from the precomputed keyed state */
 212     u64x v0 = v0p;
 213     u64x v1 = v1p;
 214     u64x v2 = v2p;
 215     u64x v3 = v3p;
 216
 217     int i;
 218     int j;
 219
         /* compression: i counts bytes in steps of 8, j indexes w[] in
            steps of two 32-bit words. The condition is i <= pw_len, so the
            64-bit word that received the length byte is absorbed as well.
            Each word is XORed into v3, followed by two rounds, then XORed
            into v0. */
 220     for (i = 0, j = 0; i <= pw_len; i += 8, j += 2)
 221     {
 222       u64x m = hl32_to_64 (w[j + 1], w[j + 0]);
 223
 224       v3 ^= m;
 225
 226       SIPROUND (v0, v1, v2, v3);
 227       SIPROUND (v0, v1, v2, v3);
 228
 229       v0 ^= m;
 230     }
 231
         /* finalization: flip the low byte of v2, run four more rounds and
            fold the four state words into one 64-bit value */
 232     v2 ^= 0xff;
 233
 234     SIPROUND (v0, v1, v2, v3);
 235     SIPROUND (v0, v1, v2, v3);
 236     SIPROUND (v0, v1, v2, v3);
 237     SIPROUND (v0, v1, v2, v3);
 238
 239     const u64x v = v0 ^ v1 ^ v2 ^ v3;
 240
 241     const u32x a = l32_from_64 (v);
 242     const u32x b = h32_from_64 (v);
 243
         /* r0/r1 carry the two 32-bit halves of the result, r2/r3 are zero.
            The included comparison code presumably reads r0..r3 by name -
            confirm against the VECT_COMPARE_M file. */
 244     const u32x r0 = a;
 245     const u32x r1 = b;
 246     const u32x r2 = 0;
 247     const u32x r3 = 0;
 248
 249     #include VECT_COMPARE_M
 250   }
 251 }
 252
 /**
  * m10100_m08: entry point with the same parameter list as m10100_m04.
  * Its body is empty, so launching it performs no work.
  */
 253 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_m08 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 254 {
 255 }
 256
 /**
  * m10100_m16: entry point with the same parameter list as m10100_m04.
  * Its body is empty, so launching it performs no work.
  */
 257 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_m16 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 258 {
 259 }
 260
 /**
  * m10100_s04: combinator kernel, single-hash comparison variant.
  *
  * Each work-item takes one left-hand word (pws[gid]) and, for every
  * right-hand word in combs_buf[0 .. combs_cnt-1], merges the two into a
  * 16-word buffer, runs a SipHash-style computation over it (2 rounds per
  * 64-bit message word, 4 finalization rounds) and hands the 64-bit result
  * to the comparison code included via VECT_COMPARE_S.
  *
  * Parameters read by the visible body:
  *   pws             left-hand words; .i[0..7] and .pw_len are read
  *   combs_buf       right-hand words; .i[0..7] and .pw_len are read
  *   digests_buf     digest_buf[DGST_R0..DGST_R3] of entry digests_offset
  *                   is copied into search[]
  *   digests_offset  index into digests_buf
  *   salt_bufs       salt_buf[0..3] of entry salt_pos is XORed into the
  *                   initial state
  *   salt_pos        index into salt_bufs
  *   combs_cnt       number of right-hand words to iterate over
  *   combs_mode      compared against COMBINATOR_MODE_BASE_LEFT / _RIGHT to
  *                   decide which side's buffer is passed to
  *                   switch_buffer_by_offset
  *   gid_max         work-items with gid >= gid_max return immediately
  *
  * The remaining parameters are not referenced in the visible body; the
  * textually included VECT_COMPARE_S file (not visible here) presumably
  * uses some of them together with locals such as lid, gid, il_pos, search
  * and r0..r3 - confirm against that file before renaming anything.
  */
 261 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_s04 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 262 {
 263   /**
 264    * modifier
 265    */
 266
       /* lid is not referenced in the visible body; presumably the included
          comparison code uses it - confirm */
 267   const u32 lid = get_local_id (0);
 268
 269   /**
 270    * base
 271    */
 272
 273   const u32 gid = get_global_id (0);
 274
       /* work-items at or beyond gid_max have nothing to process */
 275   if (gid >= gid_max) return;
 276
       /* left-hand word: the first eight 32-bit words come from pws[gid],
          the upper eight start out as zero */
 277   u32x wordl0[4];
 278
 279   wordl0[0] = pws[gid].i[ 0];
 280   wordl0[1] = pws[gid].i[ 1];
 281   wordl0[2] = pws[gid].i[ 2];
 282   wordl0[3] = pws[gid].i[ 3];
 283
 284   u32x wordl1[4];
 285
 286   wordl1[0] = pws[gid].i[ 4];
 287   wordl1[1] = pws[gid].i[ 5];
 288   wordl1[2] = pws[gid].i[ 6];
 289   wordl1[3] = pws[gid].i[ 7];
 290
 291   u32x wordl2[4];
 292
 293   wordl2[0] = 0;
 294   wordl2[1] = 0;
 295   wordl2[2] = 0;
 296   wordl2[3] = 0;
 297
 298   u32x wordl3[4];
 299
 300   wordl3[0] = 0;
 301   wordl3[1] = 0;
 302   wordl3[2] = 0;
 303   wordl3[3] = 0;
 304
 305   const u32 pw_l_len = pws[gid].pw_len;
 306
       /* BASE_RIGHT mode: the left-hand buffer is passed to
          switch_buffer_by_offset once, outside the loop, with the length of
          combs_buf[0]. The helper is defined elsewhere; presumably it moves
          the buffer contents up by that many bytes so both sides can be
          OR-merged below - confirm.
          NOTE(review): only entry 0's length is used here, while the loop
          reads a per-entry pw_len; presumably all right-hand words of a
          launch share one length in this mode - confirm. */
 307   if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
 308   {
 309     switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
 310   }
 311
 312   /**
 313    * digest
 314    */
 315
       /* the four words of the single target digest. search[] is not used
          again in the visible body; presumably the included VECT_COMPARE_S
          code compares r0..r3 against it - confirm */
 316   const u32 search[4] =
 317   {
 318     digests_buf[digests_offset].digest_buf[DGST_R0],
 319     digests_buf[digests_offset].digest_buf[DGST_R1],
 320     digests_buf[digests_offset].digest_buf[DGST_R2],
 321     digests_buf[digests_offset].digest_buf[DGST_R3]
 322   };
 323
 324   /**
 325    * base
 326    */
 327
       /* initial state shared by all candidates of this work-item: the four
          SIPHASHM_* constants, with the 64-bit value built from salt_buf[1]
          and salt_buf[0] XORed into v0p and v2p and the one built from
          salt_buf[3] and salt_buf[2] XORed into v1p and v3p */
 328   u64 v0p = SIPHASHM_0;
 329   u64 v1p = SIPHASHM_1;
 330   u64 v2p = SIPHASHM_2;
 331   u64 v3p = SIPHASHM_3;
 332
 333   v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
 334   v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
 335   v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
 336   v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
 337
 338   /**
 339    * loop
 340    */
 341
       /* one candidate per right-hand word */
 342   for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++)
 343   {
 344     const u32 pw_r_len = combs_buf[il_pos].pw_len;
 345
 346     const u32 pw_len = pw_l_len + pw_r_len;
 347
         /* right-hand word, laid out like the left-hand one */
 348     u32 wordr0[4];
 349
 350     wordr0[0] = combs_buf[il_pos].i[0];
 351     wordr0[1] = combs_buf[il_pos].i[1];
 352     wordr0[2] = combs_buf[il_pos].i[2];
 353     wordr0[3] = combs_buf[il_pos].i[3];
 354
 355     u32 wordr1[4];
 356
 357     wordr1[0] = combs_buf[il_pos].i[4];
 358     wordr1[1] = combs_buf[il_pos].i[5];
 359     wordr1[2] = combs_buf[il_pos].i[6];
 360     wordr1[3] = combs_buf[il_pos].i[7];
 361
 362     u32 wordr2[4];
 363
 364     wordr2[0] = 0;
 365     wordr2[1] = 0;
 366     wordr2[2] = 0;
 367     wordr2[3] = 0;
 368
 369     u32 wordr3[4];
 370
 371     wordr3[0] = 0;
 372     wordr3[1] = 0;
 373     wordr3[2] = 0;
 374     wordr3[3] = 0;
 375
         /* BASE_LEFT mode: the right-hand buffer is the one passed to
            switch_buffer_by_offset, with the left-hand word's length */
 376     if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
 377     {
 378       switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
 379     }
 380
         /* merge both sides word by word with bitwise OR */
 381     u32x w[16];
 382
 383     w[ 0] = wordl0[0] | wordr0[0];
 384     w[ 1] = wordl0[1] | wordr0[1];
 385     w[ 2] = wordl0[2] | wordr0[2];
 386     w[ 3] = wordl0[3] | wordr0[3];
 387     w[ 4] = wordl1[0] | wordr1[0];
 388     w[ 5] = wordl1[1] | wordr1[1];
 389     w[ 6] = wordl1[2] | wordr1[2];
 390     w[ 7] = wordl1[3] | wordr1[3];
 391     w[ 8] = wordl2[0] | wordr2[0];
 392     w[ 9] = wordl2[1] | wordr2[1];
 393     w[10] = wordl2[2] | wordr2[2];
 394     w[11] = wordl2[3] | wordr2[3];
 395     w[12] = wordl3[0] | wordr3[0];
 396     w[13] = wordl3[1] | wordr3[1];
 397     w[14] = wordl3[2] | wordr3[2];
 398     w[15] = wordl3[3] | wordr3[3];
 399
         /* view w[] as 64-bit words and OR the combined length into the top
            byte of the word at index pw_len / 8.
            NOTE(review): the cast presumes u32x is a plain 32-bit scalar
            (the VECT_SIZE1 case) - confirm. No bound on pw_len / 8 is
            checked here; w[] holds eight 64-bit words, so presumably the
            host keeps the combined length below 64 - confirm. */
 400     u64 *w_ptr = (u64 *) w;
 401
 402     w_ptr[pw_len / 8] |= (u64) pw_len << 56;
 403
         /* every candidate restarts from the precomputed keyed state */
 404     u64x v0 = v0p;
 405     u64x v1 = v1p;
 406     u64x v2 = v2p;
 407     u64x v3 = v3p;
 408
 409     int i;
 410     int j;
 411
         /* compression: i counts bytes in steps of 8, j indexes w[] in
            steps of two 32-bit words. The condition is i <= pw_len, so the
            64-bit word that received the length byte is absorbed as well.
            Each word is XORed into v3, followed by two rounds, then XORed
            into v0. */
 412     for (i = 0, j = 0; i <= pw_len; i += 8, j += 2)
 413     {
 414       u64x m = hl32_to_64 (w[j + 1], w[j + 0]);
 415
 416       v3 ^= m;
 417
 418       SIPROUND (v0, v1, v2, v3);
 419       SIPROUND (v0, v1, v2, v3);
 420
 421       v0 ^= m;
 422     }
 423
         /* finalization: flip the low byte of v2, run four more rounds and
            fold the four state words into one 64-bit value */
 424     v2 ^= 0xff;
 425
 426     SIPROUND (v0, v1, v2, v3);
 427     SIPROUND (v0, v1, v2, v3);
 428     SIPROUND (v0, v1, v2, v3);
 429     SIPROUND (v0, v1, v2, v3);
 430
 431     const u64x v = v0 ^ v1 ^ v2 ^ v3;
 432
 433     const u32x a = l32_from_64 (v);
 434     const u32x b = h32_from_64 (v);
 435
         /* r0/r1 carry the two 32-bit halves of the result, r2/r3 are zero.
            The included comparison code presumably reads r0..r3 by name -
            confirm against the VECT_COMPARE_S file. */
 436     const u32x r0 = a;
 437     const u32x r1 = b;
 438     const u32x r2 = 0;
 439     const u32x r3 = 0;
 440
 441     #include VECT_COMPARE_S
 442   }
 443 }
 444
 /**
  * m10100_s08: entry point with the same parameter list as m10100_s04.
  * Its body is empty, so launching it performs no work.
  */
 445 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_s08 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 446 {
 447 }
 448
 /**
  * m10100_s16: entry point with the same parameter list as m10100_s04.
  * Its body is empty, so launching it performs no work.
  */
 449 __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10100_s16 (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 450 {
 451 }