#define _SCRYPT_
-#include "include/constants.h"
-#include "include/kernel_vendor.h"
+#include "inc_vendor.cl"
+#include "inc_hash_constants.h"
+#include "inc_hash_functions.cl"
+#include "inc_types.cl"
+#include "inc_common.cl"
-#define DGST_R0 0
-#define DGST_R1 1
-#define DGST_R2 2
-#define DGST_R3 3
-
-#include "include/kernel_functions.c"
-#include "types_ocl.c"
-#include "common.c"
-
-#ifdef VECT_SIZE1
-#define COMPARE_M "check_multi_vect1_comp4.c"
-#endif
-
-#ifdef VECT_SIZE2
-#define COMPARE_M "check_multi_vect2_comp4.c"
-#endif
-
-#ifdef VECT_SIZE4
-#define COMPARE_M "check_multi_vect4_comp4.c"
-#endif
+#define COMPARE_S "inc_comp_single.cl"
+#define COMPARE_M "inc_comp_multi.cl"
// SHA-256 round constants K[t], declared with all 64 entries.
// NOTE(review): diff fragment — only the final four constants
// (t = 60..63) are visible; the rest are elided by the patch context.
__constant u32 k_sha256[64] =
{
SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f,
};
// SHA-256 compression function: folds one 64-byte message block
// (w0..w3, 16 u32 words) into the running state digest[8].
// NOTE(review): diff fragment — the c..h loads, rounds 1..15, the
// remaining expansion blocks and the digest[0..6] write-back are
// elided by the patch context.
-static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8])
+void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8])
{
u32 a = digest[0];
u32 b = digest[1];
ROUND_STEP (0);
// Loop unrolling becomes opt-in via the _unroll macro instead of
// unconditional.
+ #ifdef _unroll
#pragma unroll
+ #endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
digest[7] += h;
}
// Precompute the HMAC-SHA256 pad states from the 64-byte key held in
// w0..w3: the key XORed with the ipad/opad bytes is hashed into the
// ipad and opad states for later reuse by hmac_sha256_run.
// NOTE(review): diff fragment — the remaining ^0x36 lines, the ipad
// transform, and the ^0x5c5c5c5c opad setup are elided by the patch
// context; only the head and the final opad transform are visible.
-static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8])
+void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
sha256_transform (w0, w1, w2, w3, opad);
}
// HMAC-SHA256 finalization: seed digest[] from the precomputed ipad
// state, hash the message block, then run the opad pass.
// NOTE(review): diff fragment — the ipad[2..7] copies, the message
// transform, the padding setup, and the opad pass are elided; only the
// head and a single transform call are visible here.
-static void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8])
+void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
sha256_transform (w0, w1, w2, w3, digest);
}
// Append the 8 bytes in `append` into the 64-byte block
// (block0..block3) at byte offset block_len.
// NOTE(review): diff fragment — every `case` arm of the offset switch
// is elided by the patch context, leaving an empty-looking shell.
-static void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2])
+void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2])
{
switch (block_len)
{
}
}
-static uint4 swap_workaround (uint4 v)
+uint4 swap32_4 (uint4 v)
{
return (rotate ((v & 0x00FF00FF), 24u) | rotate ((v & 0xFF00FF00), 8u));
}
// Buffer sizes in u32 words:
//   GET_STATE_CNT(r)  - one BlockMix state: 2*r chunks of 16 words.
//   GET_SMIX_CNT(r,N) - full SMix scratch across N iterations.
#define GET_SMIX_CNT(r,N) (2 * (r) * 16 * (N))
#define GET_STATE_CNT(r) (2 * (r) * 16)
// Compile-time sizes for this kernel's fixed SCRYPT_N/R/P; the *4
// variants count uint4 elements rather than u32 words.
+#define SCRYPT_CNT GET_SCRYPT_CNT (SCRYPT_R, SCRYPT_P)
+#define SCRYPT_CNT4 (SCRYPT_CNT / 4)
+#define STATE_CNT GET_STATE_CNT (SCRYPT_R)
+#define STATE_CNT4 (STATE_CNT / 4)
+
// Core Salsa20 step: XOR the rotated sum of i1+i2 into r.
#define ADD_ROTATE_XOR(r,i1,i2,s) (r) ^= rotate ((i1) + (i2), (s));
// NOTE(review): SALSA20_2R below is truncated by the diff context —
// only its final continuation lines are visible.
#define SALSA20_2R() \
R3 = R3 + X3; \
}
-static void salsa_r (uint4 *T, const u32 r)
+void salsa_r (uint4 *TI)
{
- const u32 state_cnt = GET_STATE_CNT (r);
+ uint4 R0 = TI[STATE_CNT4 - 4];
+ uint4 R1 = TI[STATE_CNT4 - 3];
+ uint4 R2 = TI[STATE_CNT4 - 2];
+ uint4 R3 = TI[STATE_CNT4 - 1];
- const u32 state_cnt4 = state_cnt / 4;
+ uint4 TO[STATE_CNT4];
- uint4 R0 = T[state_cnt4 - 4];
- uint4 R1 = T[state_cnt4 - 3];
- uint4 R2 = T[state_cnt4 - 2];
- uint4 R3 = T[state_cnt4 - 1];
+ int idx_y = 0;
+ int idx_r1 = 0;
+ int idx_r2 = SCRYPT_R * 4;
- for (u32 i = 0; i < state_cnt4; i += 8)
+ for (int i = 0; i < SCRYPT_R; i++)
{
uint4 Y0;
uint4 Y1;
uint4 Y2;
uint4 Y3;
- Y0 = T[i + 0];
- Y1 = T[i + 1];
- Y2 = T[i + 2];
- Y3 = T[i + 3];
+ Y0 = TI[idx_y++];
+ Y1 = TI[idx_y++];
+ Y2 = TI[idx_y++];
+ Y3 = TI[idx_y++];
SALSA20_8_XOR ();
- T[i + 0] = R0;
- T[i + 1] = R1;
- T[i + 2] = R2;
- T[i + 3] = R3;
+ TO[idx_r1++] = R0;
+ TO[idx_r1++] = R1;
+ TO[idx_r1++] = R2;
+ TO[idx_r1++] = R3;
- Y0 = T[i + 4];
- Y1 = T[i + 5];
- Y2 = T[i + 6];
- Y3 = T[i + 7];
+ Y0 = TI[idx_y++];
+ Y1 = TI[idx_y++];
+ Y2 = TI[idx_y++];
+ Y3 = TI[idx_y++];
SALSA20_8_XOR ();
- T[i + 4] = R0;
- T[i + 5] = R1;
- T[i + 6] = R2;
- T[i + 7] = R3;
- }
-
- #define exchg(x,y) { const uint4 t = T[(x)]; T[(x)] = T[(y)]; T[(y)] = t; }
-
- #define exchg4(x,y) \
- { \
- const u32 x4 = (x) * 4; \
- const u32 y4 = (y) * 4; \
- \
- exchg (x4 + 0, y4 + 0); \
- exchg (x4 + 1, y4 + 1); \
- exchg (x4 + 2, y4 + 2); \
- exchg (x4 + 3, y4 + 3); \
+ TO[idx_r2++] = R0;
+ TO[idx_r2++] = R1;
+ TO[idx_r2++] = R2;
+ TO[idx_r2++] = R3;
}
- for (u32 i = 1; i < r / 1; i++)
- {
- const u32 x = i * 1;
- const u32 y = i * 2;
-
- exchg4 (x, y);
- }
-
- for (u32 i = 1; i < r / 2; i++)
+ #pragma unroll
+ for (int i = 0; i < STATE_CNT4; i++)
{
- const u32 x = i * 1;
- const u32 y = i * 2;
-
- const u32 xr1 = (r * 2) - 1 - x;
- const u32 yr1 = (r * 2) - 1 - y;
-
- exchg4 (xr1, yr1);
+ TI[i] = TO[i];
}
}
// scrypt SMix with a time-memory trade-off: only every SCRYPT_TMTO-th
// row of the V scratchpad is stored; skipped rows are recomputed with
// salsa_r on lookup. V is split across four global buffers selected by
// work-item lane (global id mod 4).
// NOTE(review): diff fragment — the entry/exit column-shuffle loops
// only show the T[0]/T[1] assignments; the T[2]/T[3] lines and the
// X[] write-back lines are elided by the patch context.
-static void scrypt_smix (uint4 *X, uint4 *T, const u32 N, const u32 r, const u32 tmto, const u32 phy, __global uint4 *V)
+void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __global uint4 *V2, __global uint4 *V3)
{
// V addressing: row-major over (xd4, y, z); runtime dimensions were
// replaced by the compile-time SCRYPT_* constants.
- const u32 state_cnt = GET_STATE_CNT (r);
+ #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
+ #define CO Coord(xd4,y,z)
- const u32 state_cnt4 = state_cnt / 4;
+ const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
+ const u32 zSIZE = STATE_CNT4;
- #define Coord(x,y,z) (((x) * zSIZE) + ((y) * zSIZE * xSIZE) + (z))
- #define CO Coord(x,y,z)
+ const u32 x = get_global_id (0);
- const u32 xSIZE = phy;
- const u32 ySIZE = N / tmto;
- const u32 zSIZE = state_cnt4;
-
- const u32 gid = get_global_id (0);
-
- const u32 x = gid % xSIZE;
// Lane split: xd4 picks the row group, xm4 picks which of the four V
// buffers this work-item uses.
+ const u32 xd4 = x / 4;
+ const u32 xm4 = x & 3;
+ #ifdef _unroll
#pragma unroll
- for (u32 i = 0; i < state_cnt4; i += 4)
+ #endif
+ for (u32 i = 0; i < STATE_CNT4; i += 4)
{
T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
// Phase 1: fill V with every SCRYPT_TMTO-th state, advancing the
// state by salsa_r between stores.
for (u32 y = 0; y < ySIZE; y++)
{
- for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
-
- for (u32 i = 0; i < tmto; i++) salsa_r (X, r);
+ switch (xm4)
+ {
+ case 0: for (u32 z = 0; z < zSIZE; z++) V0[CO] = X[z]; break;
+ case 1: for (u32 z = 0; z < zSIZE; z++) V1[CO] = X[z]; break;
+ case 2: for (u32 z = 0; z < zSIZE; z++) V2[CO] = X[z]; break;
+ case 3: for (u32 z = 0; z < zSIZE; z++) V3[CO] = X[z]; break;
+ }
+
+ for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
}
// Phase 2: N data-dependent lookups; j = Integerify(X) mod N picks
// the row, split into a stored row y plus km recompute steps.
- for (u32 i = 0; i < N; i++)
+ for (u32 i = 0; i < SCRYPT_N; i++)
{
- const u32 k = X[zSIZE - 4].x & (N - 1);
+ const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
- const u32 y = k / tmto;
+ const u32 y = k / SCRYPT_TMTO;
- const u32 km = k - (y * tmto);
+ const u32 km = k - (y * SCRYPT_TMTO);
- for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];
+ switch (xm4)
+ {
+ case 0: for (u32 z = 0; z < zSIZE; z++) T[z] = V0[CO]; break;
+ case 1: for (u32 z = 0; z < zSIZE; z++) T[z] = V1[CO]; break;
+ case 2: for (u32 z = 0; z < zSIZE; z++) T[z] = V2[CO]; break;
+ case 3: for (u32 z = 0; z < zSIZE; z++) T[z] = V3[CO]; break;
+ }
// Recompute the km rows that were not stored, then fold into X.
- for (u32 i = 0; i < km; i++) salsa_r (T, r);
+ for (u32 i = 0; i < km; i++) salsa_r (T);
for (u32 z = 0; z < zSIZE; z++) X[z] ^= T[z];
- salsa_r (X, r);
+ salsa_r (X);
}
// Inverse column shuffle on the way out (mirrors the entry loop).
+ #ifdef _unroll
#pragma unroll
- for (u32 i = 0; i < state_cnt4; i += 4)
+ #endif
+ for (u32 i = 0; i < STATE_CNT4; i += 4)
{
T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
}
}
// Init kernel: first PBKDF2-HMAC-SHA256 pass. Derives the scrypt B
// buffer (tmps[gid].P) from the candidate password and salt, one
// 32-byte HMAC output per block index j.
// NOTE(review): diff fragment — password/salt loads, the digest
// store, and most of the loop body are elided by the patch context.
-__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_init (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV0_buf, __global uint4 *d_scryptV1_buf, __global uint4 *d_scryptV2_buf, __global uint4 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
const u32 salt_len = salt_bufs[salt_pos].salt_len;
// The runtime scrypt_* locals are gone; sizes now come from the
// compile-time SCRYPT_CNT/STATE_CNT macros.
- /**
- * memory buffers
- */
-
- const u32 scrypt_r = SCRYPT_R;
- const u32 scrypt_p = SCRYPT_P;
- //const u32 scrypt_N = SCRYPT_N;
-
- //const u32 state_cnt = GET_STATE_CNT (scrypt_r);
- const u32 scrypt_cnt = GET_SCRYPT_CNT (scrypt_r, scrypt_p);
- //const u32 smix_cnt = GET_SMIX_CNT (scrypt_r, scrypt_N);
-
/**
* 1st pbkdf2, creates B
*/
// Byte-swap the password words before keying the HMAC; the helper was
// renamed swap_workaround -> swap32 in the shared includes.
- w0[0] = swap_workaround (w0[0]);
- w0[1] = swap_workaround (w0[1]);
- w0[2] = swap_workaround (w0[2]);
- w0[3] = swap_workaround (w0[3]);
- w1[0] = swap_workaround (w1[0]);
- w1[1] = swap_workaround (w1[1]);
- w1[2] = swap_workaround (w1[2]);
- w1[3] = swap_workaround (w1[3]);
- w2[0] = swap_workaround (w2[0]);
- w2[1] = swap_workaround (w2[1]);
- w2[2] = swap_workaround (w2[2]);
- w2[3] = swap_workaround (w2[3]);
- w3[0] = swap_workaround (w3[0]);
- w3[1] = swap_workaround (w3[1]);
- w3[2] = swap_workaround (w3[2]);
- w3[3] = swap_workaround (w3[3]);
+ w0[0] = swap32 (w0[0]);
+ w0[1] = swap32 (w0[1]);
+ w0[2] = swap32 (w0[2]);
+ w0[3] = swap32 (w0[3]);
+ w1[0] = swap32 (w1[0]);
+ w1[1] = swap32 (w1[1]);
+ w1[2] = swap32 (w1[2]);
+ w1[3] = swap32 (w1[3]);
+ w2[0] = swap32 (w2[0]);
+ w2[1] = swap32 (w2[1]);
+ w2[2] = swap32 (w2[2]);
+ w2[3] = swap32 (w2[3]);
+ w3[0] = swap32 (w3[0]);
+ w3[1] = swap32 (w3[1]);
+ w3[2] = swap32 (w3[2]);
+ w3[3] = swap32 (w3[3]);
u32 ipad[8];
u32 opad[8];
hmac_sha256_pad (w0, w1, w2, w3, ipad, opad);
// One PBKDF2 block per j: message = salt || INT(j+1), as per RFC 2898.
- for (u32 i = 0, j = 0, k = 0; i < scrypt_cnt; i += 8, j += 1, k += 2)
+ for (u32 i = 0, j = 0, k = 0; i < SCRYPT_CNT; i += 8, j += 1, k += 2)
{
w0[0] = salt_buf0[0];
w0[1] = salt_buf0[1];
u32 append[2];
- append[0] = swap_workaround (j + 1);
+ append[0] = swap32 (j + 1);
append[1] = 0x80;
memcat8 (w0, w1, w2, w3, salt_len, append);
- w0[0] = swap_workaround (w0[0]);
- w0[1] = swap_workaround (w0[1]);
- w0[2] = swap_workaround (w0[2]);
- w0[3] = swap_workaround (w0[3]);
- w1[0] = swap_workaround (w1[0]);
- w1[1] = swap_workaround (w1[1]);
- w1[2] = swap_workaround (w1[2]);
- w1[3] = swap_workaround (w1[3]);
- w2[0] = swap_workaround (w2[0]);
- w2[1] = swap_workaround (w2[1]);
- w2[2] = swap_workaround (w2[2]);
- w2[3] = swap_workaround (w2[3]);
- w3[0] = swap_workaround (w3[0]);
- w3[1] = swap_workaround (w3[1]);
+ w0[0] = swap32 (w0[0]);
+ w0[1] = swap32 (w0[1]);
+ w0[2] = swap32 (w0[2]);
+ w0[3] = swap32 (w0[3]);
+ w1[0] = swap32 (w1[0]);
+ w1[1] = swap32 (w1[1]);
+ w1[2] = swap32 (w1[2]);
+ w1[3] = swap32 (w1[3]);
+ w2[0] = swap32 (w2[0]);
+ w2[1] = swap32 (w2[1]);
+ w2[2] = swap32 (w2[2]);
+ w2[3] = swap32 (w2[3]);
+ w3[0] = swap32 (w3[0]);
+ w3[1] = swap32 (w3[1]);
// SHA-256 length field: 64-byte key block + salt + 4-byte counter,
// in bits.
w3[2] = 0;
w3[3] = (64 + salt_len + 4) * 8;
}
}
-__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_loop (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08900_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV0_buf, __global uint4 *d_scryptV1_buf, __global uint4 *d_scryptV2_buf, __global uint4 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
const u32 gid = get_global_id (0);
if (gid >= gid_max) return;
- const u32 scrypt_phy = salt_bufs[salt_pos].scrypt_phy;
-
- const u32 state_cnt = GET_STATE_CNT (SCRYPT_R);
- const u32 scrypt_cnt = GET_SCRYPT_CNT (SCRYPT_R, SCRYPT_P);
-
- const u32 state_cnt4 = state_cnt / 4;
- const u32 scrypt_cnt4 = scrypt_cnt / 4;
-
- uint4 X[state_cnt4];
- uint4 T[state_cnt4];
+ uint4 X[STATE_CNT4];
+ uint4 T[STATE_CNT4];
+ #ifdef _unroll
#pragma unroll
- for (int z = 0; z < state_cnt4; z++) X[z] = swap_workaround (tmps[gid].P[z]);
+ #endif
+ for (int z = 0; z < STATE_CNT4; z++) X[z] = swap32_4 (tmps[gid].P[z]);
- scrypt_smix (X, T, SCRYPT_N, SCRYPT_R, SCRYPT_TMTO, scrypt_phy, d_scryptV_buf);
+ scrypt_smix (X, T, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf);
+ #ifdef _unroll
#pragma unroll
- for (int z = 0; z < state_cnt4; z++) tmps[gid].P[z] = swap_workaround (X[z]);
+ #endif
+ for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = swap32_4 (X[z]);
#if SCRYPT_P >= 1
- for (int i = state_cnt4; i < scrypt_cnt4; i += state_cnt4)
+ for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4)
{
- for (int z = 0; z < state_cnt4; z++) X[z] = swap_workaround (tmps[gid].P[i + z]);
+ for (int z = 0; z < STATE_CNT4; z++) X[z] = swap32_4 (tmps[gid].P[i + z]);
- scrypt_smix (X, T, SCRYPT_N, SCRYPT_R, SCRYPT_TMTO, scrypt_phy, d_scryptV_buf);
+ scrypt_smix (X, T, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf);
- for (int z = 0; z < state_cnt4; z++) tmps[gid].P[i + z] = swap_workaround (X[z]);
+ for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = swap32_4 (X[z]);
}
#endif
}
-__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_comp (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08900_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global scrypt_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global uint4 *d_scryptV0_buf, __global uint4 *d_scryptV1_buf, __global uint4 *d_scryptV2_buf, __global uint4 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
w3[2] = pws[gid].i[14];
w3[3] = pws[gid].i[15];
- /**
- * memory buffers
- */
-
- const u32 scrypt_r = SCRYPT_R;
- const u32 scrypt_p = SCRYPT_P;
- //const u32 scrypt_N = SCRYPT_N;
-
- const u32 scrypt_cnt = GET_SCRYPT_CNT (scrypt_r, scrypt_p);
-
- const u32 scrypt_cnt4 = scrypt_cnt / 4;
-
/**
* 2nd pbkdf2, creates B
*/
- w0[0] = swap_workaround (w0[0]);
- w0[1] = swap_workaround (w0[1]);
- w0[2] = swap_workaround (w0[2]);
- w0[3] = swap_workaround (w0[3]);
- w1[0] = swap_workaround (w1[0]);
- w1[1] = swap_workaround (w1[1]);
- w1[2] = swap_workaround (w1[2]);
- w1[3] = swap_workaround (w1[3]);
- w2[0] = swap_workaround (w2[0]);
- w2[1] = swap_workaround (w2[1]);
- w2[2] = swap_workaround (w2[2]);
- w2[3] = swap_workaround (w2[3]);
- w3[0] = swap_workaround (w3[0]);
- w3[1] = swap_workaround (w3[1]);
- w3[2] = swap_workaround (w3[2]);
- w3[3] = swap_workaround (w3[3]);
+ w0[0] = swap32 (w0[0]);
+ w0[1] = swap32 (w0[1]);
+ w0[2] = swap32 (w0[2]);
+ w0[3] = swap32 (w0[3]);
+ w1[0] = swap32 (w1[0]);
+ w1[1] = swap32 (w1[1]);
+ w1[2] = swap32 (w1[2]);
+ w1[3] = swap32 (w1[3]);
+ w2[0] = swap32 (w2[0]);
+ w2[1] = swap32 (w2[1]);
+ w2[2] = swap32 (w2[2]);
+ w2[3] = swap32 (w2[3]);
+ w3[0] = swap32 (w3[0]);
+ w3[1] = swap32 (w3[1]);
+ w3[2] = swap32 (w3[2]);
+ w3[3] = swap32 (w3[3]);
u32 ipad[8];
u32 opad[8];
hmac_sha256_pad (w0, w1, w2, w3, ipad, opad);
- for (u32 l = 0; l < scrypt_cnt4; l += 4)
+ for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
{
barrier (CLK_GLOBAL_MEM_FENCE);
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
- w3[3] = (64 + (scrypt_cnt * 4) + 4) * 8;
+ w3[3] = (64 + (SCRYPT_CNT * 4) + 4) * 8;
u32 digest[8];
hmac_sha256_run (w0, w1, w2, w3, ipad, opad, digest);
- const u32 r0 = swap_workaround (digest[DGST_R0]);
- const u32 r1 = swap_workaround (digest[DGST_R1]);
- const u32 r2 = swap_workaround (digest[DGST_R2]);
- const u32 r3 = swap_workaround (digest[DGST_R3]);
+ const u32 r0 = swap32 (digest[DGST_R0]);
+ const u32 r1 = swap32 (digest[DGST_R1]);
+ const u32 r2 = swap32 (digest[DGST_R2]);
+ const u32 r3 = swap32 (digest[DGST_R3]);
#define il_pos 0