Fix m 60 a 0 by making modified variable non-const
[hashcat.git] / OpenCL / m11600.cl
index 1ba1fbe..e840e92 100644 (file)
@@ -1,24 +1,20 @@
 /**
- * Author......: Jens Steube <jens.steube@gmail.com>
+ * Authors.....: Jens Steube <jens.steube@gmail.com>
+ *               Gabriele Gristina <matrix@hashcat.net>
+ *
  * License.....: MIT
  */
 
 #define _SEVEN_ZIP_
 
-#include "include/constants.h"
-#include "include/kernel_vendor.h"
+#include "inc_vendor.cl"
+#include "inc_hash_constants.h"
+#include "inc_hash_functions.cl"
+#include "inc_types.cl"
+#include "inc_common.cl"
 
-#define DGST_R0 0
-#define DGST_R1 1
-#define DGST_R2 2
-#define DGST_R3 3
-
-#include "include/kernel_functions.c"
-#include "types_ocl.c"
-#include "common.c"
-
-#define COMPARE_S "check_single_comp4.c"
-#define COMPARE_M "check_multi_comp4.c"
+#define COMPARE_S "inc_comp_single.cl"
+#define COMPARE_M "inc_comp_multi.cl"
 
 __constant u32 te0[256] =
 {
@@ -707,7 +703,7 @@ __constant u32 rcon[] =
   0x1b000000, 0x36000000,
 };
 
-static void AES256_ExpandKey (u32 *userkey, u32 *rek, __local u32 s_te0[256], __local u32 s_te1[256], __local u32 s_te2[256], __local u32 s_te3[256], __local u32 s_te4[256])
+void AES256_ExpandKey (u32 *userkey, u32 *rek, __local u32 *s_te0, __local u32 *s_te1, __local u32 *s_te2, __local u32 *s_te3, __local u32 *s_te4)
 {
   rek[0] = userkey[0];
   rek[1] = userkey[1];
@@ -763,7 +759,7 @@ static void AES256_ExpandKey (u32 *userkey, u32 *rek, __local u32 s_te0[256], __
   }
 }
 
-static void AES256_InvertKey (u32 *rdk, __local u32 s_td0[256], __local u32 s_td1[256], __local u32 s_td2[256], __local u32 s_td3[256], __local u32 s_td4[256], __local u32 s_te0[256], __local u32 s_te1[256], __local u32 s_te2[256], __local u32 s_te3[256], __local u32 s_te4[256])
+void AES256_InvertKey (u32 *rdk, __local u32 *s_td0, __local u32 *s_td1, __local u32 *s_td2, __local u32 *s_td3, __local u32 *s_td4, __local u32 *s_te0, __local u32 *s_te1, __local u32 *s_te2, __local u32 *s_te3, __local u32 *s_te4)
 {
   for (u32 i = 0, j = 56; i < j; i += 4, j -= 4)
   {
@@ -803,7 +799,7 @@ static void AES256_InvertKey (u32 *rdk, __local u32 s_td0[256], __local u32 s_td
   }
 }
 
-static void AES256_decrypt (const u32 *in, u32 *out, const u32 *rdk, __local u32 s_td0[256], __local u32 s_td1[256], __local u32 s_td2[256], __local u32 s_td3[256], __local u32 s_td4[256])
+void AES256_decrypt (const u32 *in, u32 *out, const u32 *rdk, __local u32 *s_td0, __local u32 *s_td1, __local u32 *s_td2, __local u32 *s_td3, __local u32 *s_td4)
 {
   u32 s0 = in[0] ^ rdk[0];
   u32 s1 = in[1] ^ rdk[1];
@@ -913,7 +909,7 @@ __constant u32 k_sha256[64] =
   SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f,
 };
 
-static void sha256_transform (const u32 w[16], u32 digest[8])
+void sha256_transform (const u32 w[16], u32 digest[8])
 {
   u32 a = digest[0];
   u32 b = digest[1];
@@ -924,22 +920,22 @@ static void sha256_transform (const u32 w[16], u32 digest[8])
   u32 g = digest[6];
   u32 h = digest[7];
 
-  u32 w0_t = swap32 (w[ 0]);
-  u32 w1_t = swap32 (w[ 1]);
-  u32 w2_t = swap32 (w[ 2]);
-  u32 w3_t = swap32 (w[ 3]);
-  u32 w4_t = swap32 (w[ 4]);
-  u32 w5_t = swap32 (w[ 5]);
-  u32 w6_t = swap32 (w[ 6]);
-  u32 w7_t = swap32 (w[ 7]);
-  u32 w8_t = swap32 (w[ 8]);
-  u32 w9_t = swap32 (w[ 9]);
-  u32 wa_t = swap32 (w[10]);
-  u32 wb_t = swap32 (w[11]);
-  u32 wc_t = swap32 (w[12]);
-  u32 wd_t = swap32 (w[13]);
-  u32 we_t = swap32 (w[14]);
-  u32 wf_t = swap32 (w[15]);
+  u32 w0_t = w[ 0];
+  u32 w1_t = w[ 1];
+  u32 w2_t = w[ 2];
+  u32 w3_t = w[ 3];
+  u32 w4_t = w[ 4];
+  u32 w5_t = w[ 5];
+  u32 w6_t = w[ 6];
+  u32 w7_t = w[ 7];
+  u32 w8_t = w[ 8];
+  u32 w9_t = w[ 9];
+  u32 wa_t = w[10];
+  u32 wb_t = w[11];
+  u32 wc_t = w[12];
+  u32 wd_t = w[13];
+  u32 we_t = w[14];
+  u32 wf_t = w[15];
 
   #define ROUND_EXPAND()                            \
   {                                                 \
@@ -983,7 +979,9 @@ static void sha256_transform (const u32 w[16], u32 digest[8])
 
   ROUND_STEP (0);
 
+  #ifdef _unroll
   #pragma unroll
+  #endif
   for (int i = 16; i < 64; i += 16)
   {
     ROUND_EXPAND (); ROUND_STEP (i);
@@ -1067,34 +1065,20 @@ __constant u32 crc32tab[0x100] =
   0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
 };
 
-static u32 round_crc32 (u32 a, const u32 v)
+u32 round_crc32 (u32 a, const u32 v)
 {
   const u32 k = (a ^ v) & 0xff;
 
   const u32 s = a >> 8;
 
-  #ifdef VECT_SIZE1
-  a    = crc32tab[k];
-  #endif
-
-  #ifdef VECT_SIZE2
-  a.s0 = crc32tab[k.s0];
-  a.s1 = crc32tab[k.s1];
-  #endif
-
-  #ifdef VECT_SIZE4
-  a.s0 = crc32tab[k.s0];
-  a.s1 = crc32tab[k.s1];
-  a.s2 = crc32tab[k.s2];
-  a.s3 = crc32tab[k.s3];
-  #endif
+  a = crc32tab[k];
 
   a ^= s;
 
   return a;
 }
 
-static u32 crc32 (const u32 w[16], const u32 pw_len, const u32 iv)
+u32 crc32 (const u32 w[16], const u32 pw_len, const u32 iv)
 {
   u32 a = iv ^ ~0;
 
@@ -1114,127 +1098,77 @@ static u32 crc32 (const u32 w[16], const u32 pw_len, const u32 iv)
   return ~a;
 }
 
-static void bzero16 (u32 block[16])
-{
-  block[ 0] = 0;
-  block[ 1] = 0;
-  block[ 2] = 0;
-  block[ 3] = 0;
-  block[ 4] = 0;
-  block[ 5] = 0;
-  block[ 6] = 0;
-  block[ 7] = 0;
-  block[ 8] = 0;
-  block[ 9] = 0;
-  block[10] = 0;
-  block[11] = 0;
-  block[12] = 0;
-  block[13] = 0;
-  block[14] = 0;
-  block[15] = 0;
-}
-
-static u32 memcat8c (u32 block[16], const u32 block_len, const u32 append[2], const u32 append_len, u32 digest[8])
+u32 memcat8c_be (u32 block[16], const u32 block_len, const u32 append, const u32 append_len, u32 digest[8])
 {
   const u32 mod = block_len & 3;
   const u32 div = block_len / 4;
 
   u32 tmp0;
   u32 tmp1;
-  u32 tmp2;
 
   #ifdef IS_NV
-  const int offset_minus_4 = 4 - (block_len & 3);
-
-  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
+  const int selector = (0x76543210 >> ((block_len & 3) * 4)) & 0xffff;
 
-  tmp0 = __byte_perm (        0, append[0], selector);
-  tmp1 = __byte_perm (append[0], append[1], selector);
-  tmp2 = __byte_perm (append[1],         0, selector);
+  tmp0 = __byte_perm (append, 0, selector);
+  tmp1 = __byte_perm (0, append, selector);
   #endif
 
-  #ifdef IS_AMD
-  const int offset_minus_4 = 4 - block_len;
-
-  tmp0 = amd_bytealign (append[0],         0, offset_minus_4);
-  tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
-  tmp2 = amd_bytealign (        0, append[1], offset_minus_4);
-
-  if (mod == 0)
-  {
-    tmp0 = tmp1;
-    tmp1 = tmp2;
-    tmp2 = 0;
-  }
+  #if defined IS_AMD || defined IS_GENERIC
+  tmp0 = amd_bytealign (0, append, block_len);
+  tmp1 = amd_bytealign (append, 0, block_len);
   #endif
 
-  u32 carry[2] = { 0, 0 };
+  u32 carry = 0;
 
   switch (div)
   {
     case  0:  block[ 0] |= tmp0;
               block[ 1]  = tmp1;
-              block[ 2]  = tmp2;
               break;
     case  1:  block[ 1] |= tmp0;
               block[ 2]  = tmp1;
-              block[ 3]  = tmp2;
               break;
     case  2:  block[ 2] |= tmp0;
               block[ 3]  = tmp1;
-              block[ 4]  = tmp2;
               break;
     case  3:  block[ 3] |= tmp0;
               block[ 4]  = tmp1;
-              block[ 5]  = tmp2;
               break;
     case  4:  block[ 4] |= tmp0;
               block[ 5]  = tmp1;
-              block[ 6]  = tmp2;
               break;
     case  5:  block[ 5] |= tmp0;
               block[ 6]  = tmp1;
-              block[ 7]  = tmp2;
               break;
     case  6:  block[ 6] |= tmp0;
               block[ 7]  = tmp1;
-              block[ 8]  = tmp2;
               break;
     case  7:  block[ 7] |= tmp0;
               block[ 8]  = tmp1;
-              block[ 9]  = tmp2;
               break;
     case  8:  block[ 8] |= tmp0;
               block[ 9]  = tmp1;
-              block[10]  = tmp2;
               break;
     case  9:  block[ 9] |= tmp0;
               block[10]  = tmp1;
-              block[11]  = tmp2;
               break;
     case 10:  block[10] |= tmp0;
               block[11]  = tmp1;
-              block[12]  = tmp2;
               break;
     case 11:  block[11] |= tmp0;
               block[12]  = tmp1;
-              block[13]  = tmp2;
               break;
     case 12:  block[12] |= tmp0;
               block[13]  = tmp1;
-              block[14]  = tmp2;
               break;
     case 13:  block[13] |= tmp0;
               block[14]  = tmp1;
-              block[15]  = tmp2;
               break;
     case 14:  block[14] |= tmp0;
               block[15]  = tmp1;
-              carry[ 0]  = tmp2;
               break;
     case 15:  block[15] |= tmp0;
-              carry[ 0]  = tmp1;
-              carry[ 1]  = tmp2;
+              carry      = tmp1;
               break;
   }
 
@@ -1246,236 +1180,383 @@ static u32 memcat8c (u32 block[16], const u32 block_len, const u32 append[2], co
 
     sha256_transform (block, digest);
 
-    bzero16 (block);
-
-    block[0] = carry[0];
-    block[1] = carry[1];
+    block[ 0] = carry;
+    block[ 1] = 0;
+    block[ 2] = 0;
+    block[ 3] = 0;
+    block[ 4] = 0;
+    block[ 5] = 0;
+    block[ 6] = 0;
+    block[ 7] = 0;
+    block[ 8] = 0;
+    block[ 9] = 0;
+    block[10] = 0;
+    block[11] = 0;
+    block[12] = 0;
+    block[13] = 0;
+    block[14] = 0;
+    block[15] = 0;
   }
 
   return new_len;
 }
 
-static u32 memcat32c (u32 block[16], const u32 block_len, const u32 append[8], const u32 append_len, u32 digest[8])
+u32 memcat64c_be (u32 block[16], const u32 block_len, const u32 append[16], const u32 append_len, u32 digest[8])
 {
   const u32 mod = block_len & 3;
   const u32 div = block_len / 4;
 
-  u32 tmp0;
-  u32 tmp1;
-  u32 tmp2;
-  u32 tmp3;
-  u32 tmp4;
-  u32 tmp5;
-  u32 tmp6;
-  u32 tmp7;
-  u32 tmp8;
+  u32 tmp00;
+  u32 tmp01;
+  u32 tmp02;
+  u32 tmp03;
+  u32 tmp04;
+  u32 tmp05;
+  u32 tmp06;
+  u32 tmp07;
+  u32 tmp08;
+  u32 tmp09;
+  u32 tmp10;
+  u32 tmp11;
+  u32 tmp12;
+  u32 tmp13;
+  u32 tmp14;
+  u32 tmp15;
+  u32 tmp16;
 
   #ifdef IS_NV
-  const int offset_minus_4 = 4 - (block_len & 3);
-
-  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
-
-  tmp0 = __byte_perm (        0, append[0], selector);
-  tmp1 = __byte_perm (append[0], append[1], selector);
-  tmp2 = __byte_perm (append[1], append[2], selector);
-  tmp3 = __byte_perm (append[2], append[3], selector);
-  tmp4 = __byte_perm (append[3], append[4], selector);
-  tmp5 = __byte_perm (append[4], append[5], selector);
-  tmp6 = __byte_perm (append[5], append[6], selector);
-  tmp7 = __byte_perm (append[6], append[7], selector);
-  tmp8 = __byte_perm (append[7],         0, selector);
+  const int selector = (0x76543210 >> ((block_len & 3) * 4)) & 0xffff;
+
+  tmp00 = __byte_perm (append[ 0],          0, selector);
+  tmp01 = __byte_perm (append[ 1], append[ 0], selector);
+  tmp02 = __byte_perm (append[ 2], append[ 1], selector);
+  tmp03 = __byte_perm (append[ 3], append[ 2], selector);
+  tmp04 = __byte_perm (append[ 4], append[ 3], selector);
+  tmp05 = __byte_perm (append[ 5], append[ 4], selector);
+  tmp06 = __byte_perm (append[ 6], append[ 5], selector);
+  tmp07 = __byte_perm (append[ 7], append[ 6], selector);
+  tmp08 = __byte_perm (append[ 8], append[ 7], selector);
+  tmp09 = __byte_perm (append[ 9], append[ 8], selector);
+  tmp10 = __byte_perm (append[10], append[ 9], selector);
+  tmp11 = __byte_perm (append[11], append[10], selector);
+  tmp12 = __byte_perm (append[12], append[11], selector);
+  tmp13 = __byte_perm (append[13], append[12], selector);
+  tmp14 = __byte_perm (append[14], append[13], selector);
+  tmp15 = __byte_perm (append[15], append[14], selector);
+  tmp16 = __byte_perm (         0, append[15], selector);
   #endif
 
-  #ifdef IS_AMD
-  const int offset_minus_4 = 4 - block_len;
-
-  tmp0 = amd_bytealign (append[0],         0, offset_minus_4);
-  tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
-  tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
-  tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
-  tmp4 = amd_bytealign (append[4], append[3], offset_minus_4);
-  tmp5 = amd_bytealign (append[5], append[4], offset_minus_4);
-  tmp6 = amd_bytealign (append[6], append[5], offset_minus_4);
-  tmp7 = amd_bytealign (append[7], append[6], offset_minus_4);
-  tmp8 = amd_bytealign (        0, append[7], offset_minus_4);
-
-  if (mod == 0)
-  {
-    tmp0 = tmp1;
-    tmp1 = tmp2;
-    tmp2 = tmp3;
-    tmp3 = tmp4;
-    tmp4 = tmp5;
-    tmp5 = tmp6;
-    tmp6 = tmp7;
-    tmp7 = tmp8;
-    tmp8 = 0;
-  }
+  #if defined IS_AMD || defined IS_GENERIC
+  tmp00 = amd_bytealign (         0, append[ 0], block_len);
+  tmp01 = amd_bytealign (append[ 0], append[ 1], block_len);
+  tmp02 = amd_bytealign (append[ 1], append[ 2], block_len);
+  tmp03 = amd_bytealign (append[ 2], append[ 3], block_len);
+  tmp04 = amd_bytealign (append[ 3], append[ 4], block_len);
+  tmp05 = amd_bytealign (append[ 4], append[ 5], block_len);
+  tmp06 = amd_bytealign (append[ 5], append[ 6], block_len);
+  tmp07 = amd_bytealign (append[ 6], append[ 7], block_len);
+  tmp08 = amd_bytealign (append[ 7], append[ 8], block_len);
+  tmp09 = amd_bytealign (append[ 8], append[ 9], block_len);
+  tmp10 = amd_bytealign (append[ 9], append[10], block_len);
+  tmp11 = amd_bytealign (append[10], append[11], block_len);
+  tmp12 = amd_bytealign (append[11], append[12], block_len);
+  tmp13 = amd_bytealign (append[12], append[13], block_len);
+  tmp14 = amd_bytealign (append[13], append[14], block_len);
+  tmp15 = amd_bytealign (append[14], append[15], block_len);
+  tmp16 = amd_bytealign (append[15],          0, block_len);
   #endif
 
-  u32 carry[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+  u32 carry[16] = { 0 };
 
   switch (div)
   {
-    case  0:  block[ 0] |= tmp0;
-              block[ 1]  = tmp1;
-              block[ 2]  = tmp2;
-              block[ 3]  = tmp3;
-              block[ 4]  = tmp4;
-              block[ 5]  = tmp5;
-              block[ 6]  = tmp6;
-              block[ 7]  = tmp7;
-              block[ 8]  = tmp8;
+    case  0:  block[ 0] |= tmp00;
+              block[ 1]  = tmp01;
+              block[ 2]  = tmp02;
+              block[ 3]  = tmp03;
+              block[ 4]  = tmp04;
+              block[ 5]  = tmp05;
+              block[ 6]  = tmp06;
+              block[ 7]  = tmp07;
+              block[ 8]  = tmp08;
+              block[ 9]  = tmp09;
+              block[10]  = tmp10;
+              block[11]  = tmp11;
+              block[12]  = tmp12;
+              block[13]  = tmp13;
+              block[14]  = tmp14;
+              block[15]  = tmp15;
+              carry[ 0]  = tmp16;
               break;
-    case  1:  block[ 1] |= tmp0;
-              block[ 2]  = tmp1;
-              block[ 3]  = tmp2;
-              block[ 4]  = tmp3;
-              block[ 5]  = tmp4;
-              block[ 6]  = tmp5;
-              block[ 7]  = tmp6;
-              block[ 8]  = tmp7;
-              block[ 9]  = tmp8;
+    case  1:  block[ 1] |= tmp00;
+              block[ 2]  = tmp01;
+              block[ 3]  = tmp02;
+              block[ 4]  = tmp03;
+              block[ 5]  = tmp04;
+              block[ 6]  = tmp05;
+              block[ 7]  = tmp06;
+              block[ 8]  = tmp07;
+              block[ 9]  = tmp08;
+              block[10]  = tmp09;
+              block[11]  = tmp10;
+              block[12]  = tmp11;
+              block[13]  = tmp12;
+              block[14]  = tmp13;
+              block[15]  = tmp14;
+              carry[ 0]  = tmp15;
+              carry[ 1]  = tmp16;
               break;
-    case  2:  block[ 2] |= tmp0;
-              block[ 3]  = tmp1;
-              block[ 4]  = tmp2;
-              block[ 5]  = tmp3;
-              block[ 6]  = tmp4;
-              block[ 7]  = tmp5;
-              block[ 8]  = tmp6;
-              block[ 9]  = tmp7;
-              block[10]  = tmp8;
+    case  2:  block[ 2] |= tmp00;
+              block[ 3]  = tmp01;
+              block[ 4]  = tmp02;
+              block[ 5]  = tmp03;
+              block[ 6]  = tmp04;
+              block[ 7]  = tmp05;
+              block[ 8]  = tmp06;
+              block[ 9]  = tmp07;
+              block[10]  = tmp08;
+              block[11]  = tmp09;
+              block[12]  = tmp10;
+              block[13]  = tmp11;
+              block[14]  = tmp12;
+              block[15]  = tmp13;
+              carry[ 0]  = tmp14;
+              carry[ 1]  = tmp15;
+              carry[ 2]  = tmp16;
               break;
-    case  3:  block[ 3] |= tmp0;
-              block[ 4]  = tmp1;
-              block[ 5]  = tmp2;
-              block[ 6]  = tmp3;
-              block[ 7]  = tmp4;
-              block[ 8]  = tmp5;
-              block[ 9]  = tmp6;
-              block[10]  = tmp7;
-              block[11]  = tmp8;
+    case  3:  block[ 3] |= tmp00;
+              block[ 4]  = tmp01;
+              block[ 5]  = tmp02;
+              block[ 6]  = tmp03;
+              block[ 7]  = tmp04;
+              block[ 8]  = tmp05;
+              block[ 9]  = tmp06;
+              block[10]  = tmp07;
+              block[11]  = tmp08;
+              block[12]  = tmp09;
+              block[13]  = tmp10;
+              block[14]  = tmp11;
+              block[15]  = tmp12;
+              carry[ 0]  = tmp13;
+              carry[ 1]  = tmp14;
+              carry[ 2]  = tmp15;
+              carry[ 3]  = tmp16;
               break;
-    case  4:  block[ 4] |= tmp0;
-              block[ 5]  = tmp1;
-              block[ 6]  = tmp2;
-              block[ 7]  = tmp3;
-              block[ 8]  = tmp4;
-              block[ 9]  = tmp5;
-              block[10]  = tmp6;
-              block[11]  = tmp7;
-              block[12]  = tmp8;
+    case  4:  block[ 4] |= tmp00;
+              block[ 5]  = tmp01;
+              block[ 6]  = tmp02;
+              block[ 7]  = tmp03;
+              block[ 8]  = tmp04;
+              block[ 9]  = tmp05;
+              block[10]  = tmp06;
+              block[11]  = tmp07;
+              block[12]  = tmp08;
+              block[13]  = tmp09;
+              block[14]  = tmp10;
+              block[15]  = tmp11;
+              carry[ 0]  = tmp12;
+              carry[ 1]  = tmp13;
+              carry[ 2]  = tmp14;
+              carry[ 3]  = tmp15;
+              carry[ 4]  = tmp16;
               break;
-    case  5:  block[ 5] |= tmp0;
-              block[ 6]  = tmp1;
-              block[ 7]  = tmp2;
-              block[ 8]  = tmp3;
-              block[ 9]  = tmp4;
-              block[10]  = tmp5;
-              block[11]  = tmp6;
-              block[12]  = tmp7;
-              block[13]  = tmp8;
+    case  5:  block[ 5] |= tmp00;
+              block[ 6]  = tmp01;
+              block[ 7]  = tmp02;
+              block[ 8]  = tmp03;
+              block[ 9]  = tmp04;
+              block[10]  = tmp05;
+              block[11]  = tmp06;
+              block[12]  = tmp07;
+              block[13]  = tmp08;
+              block[14]  = tmp09;
+              block[15]  = tmp10;
+              carry[ 0]  = tmp11;
+              carry[ 1]  = tmp12;
+              carry[ 2]  = tmp13;
+              carry[ 3]  = tmp14;
+              carry[ 4]  = tmp15;
+              carry[ 5]  = tmp16;
               break;
-    case  6:  block[ 6] |= tmp0;
-              block[ 7]  = tmp1;
-              block[ 8]  = tmp2;
-              block[ 9]  = tmp3;
-              block[10]  = tmp4;
-              block[11]  = tmp5;
-              block[12]  = tmp6;
-              block[13]  = tmp7;
-              block[14]  = tmp8;
+    case  6:  block[ 6] |= tmp00;
+              block[ 7]  = tmp01;
+              block[ 8]  = tmp02;
+              block[ 9]  = tmp03;
+              block[10]  = tmp04;
+              block[11]  = tmp05;
+              block[12]  = tmp06;
+              block[13]  = tmp07;
+              block[14]  = tmp08;
+              block[15]  = tmp09;
+              carry[ 0]  = tmp10;
+              carry[ 1]  = tmp11;
+              carry[ 2]  = tmp12;
+              carry[ 3]  = tmp13;
+              carry[ 4]  = tmp14;
+              carry[ 5]  = tmp15;
+              carry[ 6]  = tmp16;
               break;
-    case  7:  block[ 7] |= tmp0;
-              block[ 8]  = tmp1;
-              block[ 9]  = tmp2;
-              block[10]  = tmp3;
-              block[11]  = tmp4;
-              block[12]  = tmp5;
-              block[13]  = tmp6;
-              block[14]  = tmp7;
-              block[15]  = tmp8;
+    case  7:  block[ 7] |= tmp00;
+              block[ 8]  = tmp01;
+              block[ 9]  = tmp02;
+              block[10]  = tmp03;
+              block[11]  = tmp04;
+              block[12]  = tmp05;
+              block[13]  = tmp06;
+              block[14]  = tmp07;
+              block[15]  = tmp08;
+              carry[ 0]  = tmp09;
+              carry[ 1]  = tmp10;
+              carry[ 2]  = tmp11;
+              carry[ 3]  = tmp12;
+              carry[ 4]  = tmp13;
+              carry[ 5]  = tmp14;
+              carry[ 6]  = tmp15;
+              carry[ 7]  = tmp16;
               break;
-    case  8:  block[ 8] |= tmp0;
-              block[ 9]  = tmp1;
-              block[10]  = tmp2;
-              block[11]  = tmp3;
-              block[12]  = tmp4;
-              block[13]  = tmp5;
-              block[14]  = tmp6;
-              block[15]  = tmp7;
-              carry[ 0]  = tmp8;
+    case  8:  block[ 8] |= tmp00;
+              block[ 9]  = tmp01;
+              block[10]  = tmp02;
+              block[11]  = tmp03;
+              block[12]  = tmp04;
+              block[13]  = tmp05;
+              block[14]  = tmp06;
+              block[15]  = tmp07;
+              carry[ 0]  = tmp08;
+              carry[ 1]  = tmp09;
+              carry[ 2]  = tmp10;
+              carry[ 3]  = tmp11;
+              carry[ 4]  = tmp12;
+              carry[ 5]  = tmp13;
+              carry[ 6]  = tmp14;
+              carry[ 7]  = tmp15;
+              carry[ 8]  = tmp16;
               break;
-    case  9:  block[ 9] |= tmp0;
-              block[10]  = tmp1;
-              block[11]  = tmp2;
-              block[12]  = tmp3;
-              block[13]  = tmp4;
-              block[14]  = tmp5;
-              block[15]  = tmp6;
-              carry[ 0]  = tmp7;
-              carry[ 1]  = tmp8;
+    case  9:  block[ 9] |= tmp00;
+              block[10]  = tmp01;
+              block[11]  = tmp02;
+              block[12]  = tmp03;
+              block[13]  = tmp04;
+              block[14]  = tmp05;
+              block[15]  = tmp06;
+              carry[ 0]  = tmp07;
+              carry[ 1]  = tmp08;
+              carry[ 2]  = tmp09;
+              carry[ 3]  = tmp10;
+              carry[ 4]  = tmp11;
+              carry[ 5]  = tmp12;
+              carry[ 6]  = tmp13;
+              carry[ 7]  = tmp14;
+              carry[ 8]  = tmp15;
+              carry[ 9]  = tmp16;
               break;
-    case 10:  block[10] |= tmp0;
-              block[11]  = tmp1;
-              block[12]  = tmp2;
-              block[13]  = tmp3;
-              block[14]  = tmp4;
-              block[15]  = tmp5;
-              carry[ 0]  = tmp6;
-              carry[ 1]  = tmp7;
-              carry[ 2]  = tmp8;
+    case 10:  block[10] |= tmp00;
+              block[11]  = tmp01;
+              block[12]  = tmp02;
+              block[13]  = tmp03;
+              block[14]  = tmp04;
+              block[15]  = tmp05;
+              carry[ 0]  = tmp06;
+              carry[ 1]  = tmp07;
+              carry[ 2]  = tmp08;
+              carry[ 3]  = tmp09;
+              carry[ 4]  = tmp10;
+              carry[ 5]  = tmp11;
+              carry[ 6]  = tmp12;
+              carry[ 7]  = tmp13;
+              carry[ 8]  = tmp14;
+              carry[ 9]  = tmp15;
+              carry[10]  = tmp16;
               break;
-    case 11:  block[11] |= tmp0;
-              block[12]  = tmp1;
-              block[13]  = tmp2;
-              block[14]  = tmp3;
-              block[15]  = tmp4;
-              carry[ 0]  = tmp5;
-              carry[ 1]  = tmp6;
-              carry[ 2]  = tmp7;
-              carry[ 3]  = tmp8;
+    case 11:  block[11] |= tmp00;
+              block[12]  = tmp01;
+              block[13]  = tmp02;
+              block[14]  = tmp03;
+              block[15]  = tmp04;
+              carry[ 0]  = tmp05;
+              carry[ 1]  = tmp06;
+              carry[ 2]  = tmp07;
+              carry[ 3]  = tmp08;
+              carry[ 4]  = tmp09;
+              carry[ 5]  = tmp10;
+              carry[ 6]  = tmp11;
+              carry[ 7]  = tmp12;
+              carry[ 8]  = tmp13;
+              carry[ 9]  = tmp14;
+              carry[10]  = tmp15;
+              carry[11]  = tmp16;
               break;
-    case 12:  block[12] |= tmp0;
-              block[13]  = tmp1;
-              block[14]  = tmp2;
-              block[15]  = tmp3;
-              carry[ 0]  = tmp4;
-              carry[ 1]  = tmp5;
-              carry[ 2]  = tmp6;
-              carry[ 3]  = tmp7;
-              carry[ 4]  = tmp8;
+    case 12:  block[12] |= tmp00;
+              block[13]  = tmp01;
+              block[14]  = tmp02;
+              block[15]  = tmp03;
+              carry[ 0]  = tmp04;
+              carry[ 1]  = tmp05;
+              carry[ 2]  = tmp06;
+              carry[ 3]  = tmp07;
+              carry[ 4]  = tmp08;
+              carry[ 5]  = tmp09;
+              carry[ 6]  = tmp10;
+              carry[ 7]  = tmp11;
+              carry[ 8]  = tmp12;
+              carry[ 9]  = tmp13;
+              carry[10]  = tmp14;
+              carry[11]  = tmp15;
+              carry[12]  = tmp16;
               break;
-    case 13:  block[13] |= tmp0;
-              block[14]  = tmp1;
-              block[15]  = tmp2;
-              carry[ 0]  = tmp3;
-              carry[ 1]  = tmp4;
-              carry[ 2]  = tmp5;
-              carry[ 3]  = tmp6;
-              carry[ 4]  = tmp7;
-              carry[ 5]  = tmp8;
+    case 13:  block[13] |= tmp00;
+              block[14]  = tmp01;
+              block[15]  = tmp02;
+              carry[ 0]  = tmp03;
+              carry[ 1]  = tmp04;
+              carry[ 2]  = tmp05;
+              carry[ 3]  = tmp06;
+              carry[ 4]  = tmp07;
+              carry[ 5]  = tmp08;
+              carry[ 6]  = tmp09;
+              carry[ 7]  = tmp10;
+              carry[ 8]  = tmp11;
+              carry[ 9]  = tmp12;
+              carry[10]  = tmp13;
+              carry[11]  = tmp14;
+              carry[12]  = tmp15;
+              carry[13]  = tmp16;
               break;
-    case 14:  block[14] |= tmp0;
-              block[15]  = tmp1;
-              carry[ 0]  = tmp2;
-              carry[ 1]  = tmp3;
-              carry[ 2]  = tmp4;
-              carry[ 3]  = tmp5;
-              carry[ 4]  = tmp6;
-              carry[ 5]  = tmp7;
-              carry[ 6]  = tmp8;
+    case 14:  block[14] |= tmp00;
+              block[15]  = tmp01;
+              carry[ 0]  = tmp02;
+              carry[ 1]  = tmp03;
+              carry[ 2]  = tmp04;
+              carry[ 3]  = tmp05;
+              carry[ 4]  = tmp06;
+              carry[ 5]  = tmp07;
+              carry[ 6]  = tmp08;
+              carry[ 7]  = tmp09;
+              carry[ 8]  = tmp10;
+              carry[ 9]  = tmp11;
+              carry[10]  = tmp12;
+              carry[11]  = tmp13;
+              carry[12]  = tmp14;
+              carry[13]  = tmp15;
+              carry[14]  = tmp16;
               break;
-    case 15:  block[15] |= tmp0;
-              carry[ 0]  = tmp1;
-              carry[ 1]  = tmp2;
-              carry[ 2]  = tmp3;
-              carry[ 3]  = tmp4;
-              carry[ 4]  = tmp5;
-              carry[ 5]  = tmp6;
-              carry[ 6]  = tmp7;
-              carry[ 7]  = tmp8;
+    case 15:  block[15] |= tmp00;
+              carry[ 0]  = tmp01;
+              carry[ 1]  = tmp02;
+              carry[ 2]  = tmp03;
+              carry[ 3]  = tmp04;
+              carry[ 4]  = tmp05;
+              carry[ 5]  = tmp06;
+              carry[ 6]  = tmp07;
+              carry[ 7]  = tmp08;
+              carry[ 8]  = tmp09;
+              carry[ 9]  = tmp10;
+              carry[10]  = tmp11;
+              carry[11]  = tmp12;
+              carry[12]  = tmp13;
+              carry[13]  = tmp14;
+              carry[14]  = tmp15;
+              carry[15]  = tmp16;
               break;
   }
 
@@ -1487,22 +1568,28 @@ static u32 memcat32c (u32 block[16], const u32 block_len, const u32 append[8], c
 
     sha256_transform (block, digest);
 
-    bzero16 (block);
-
-    block[0] = carry[0];
-    block[1] = carry[1];
-    block[2] = carry[2];
-    block[3] = carry[3];
-    block[4] = carry[4];
-    block[5] = carry[5];
-    block[6] = carry[6];
-    block[7] = carry[7];
+    block[ 0] = carry[ 0];
+    block[ 1] = carry[ 1];
+    block[ 2] = carry[ 2];
+    block[ 3] = carry[ 3];
+    block[ 4] = carry[ 4];
+    block[ 5] = carry[ 5];
+    block[ 6] = carry[ 6];
+    block[ 7] = carry[ 7];
+    block[ 8] = carry[ 8];
+    block[ 9] = carry[ 9];
+    block[10] = carry[10];
+    block[11] = carry[11];
+    block[12] = carry[12];
+    block[13] = carry[13];
+    block[14] = carry[14];
+    block[15] = carry[15];
   }
 
   return new_len;
 }
 
-__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_init (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global seven_zip_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global seven_zip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11600_init (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global seven_zip_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global seven_zip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 {
   /**
    * base
@@ -1512,63 +1599,41 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_init (__gl
 
   if (gid >= gid_max) return;
 
-  /**
-   * algo starts here already
-   */
-
-  u32 dgst[8];
-
-  dgst[0] = SHA256M_A;
-  dgst[1] = SHA256M_B;
-  dgst[2] = SHA256M_C;
-  dgst[3] = SHA256M_D;
-  dgst[4] = SHA256M_E;
-  dgst[5] = SHA256M_F;
-  dgst[6] = SHA256M_G;
-  dgst[7] = SHA256M_H;
-
-  u32 block[16];
-
-  bzero16 (block);
-
-  u32 block_len = 0;
-  u32 final_len = 0;
-
   /**
    * context save
    */
 
-  tmps[gid].dgst[0] = dgst[0];
-  tmps[gid].dgst[1] = dgst[1];
-  tmps[gid].dgst[2] = dgst[2];
-  tmps[gid].dgst[3] = dgst[3];
-  tmps[gid].dgst[4] = dgst[4];
-  tmps[gid].dgst[5] = dgst[5];
-  tmps[gid].dgst[6] = dgst[6];
-  tmps[gid].dgst[7] = dgst[7];
-
-  tmps[gid].block[ 0] = block[ 0];
-  tmps[gid].block[ 1] = block[ 1];
-  tmps[gid].block[ 2] = block[ 2];
-  tmps[gid].block[ 3] = block[ 3];
-  tmps[gid].block[ 4] = block[ 4];
-  tmps[gid].block[ 5] = block[ 5];
-  tmps[gid].block[ 6] = block[ 6];
-  tmps[gid].block[ 7] = block[ 7];
-  tmps[gid].block[ 8] = block[ 8];
-  tmps[gid].block[ 9] = block[ 9];
-  tmps[gid].block[10] = block[10];
-  tmps[gid].block[11] = block[11];
-  tmps[gid].block[12] = block[12];
-  tmps[gid].block[13] = block[13];
-  tmps[gid].block[14] = block[14];
-  tmps[gid].block[15] = block[15];
-
-  tmps[gid].block_len = block_len;
-  tmps[gid].final_len = final_len;
+  tmps[gid].dgst[0] = SHA256M_A;
+  tmps[gid].dgst[1] = SHA256M_B;
+  tmps[gid].dgst[2] = SHA256M_C;
+  tmps[gid].dgst[3] = SHA256M_D;
+  tmps[gid].dgst[4] = SHA256M_E;
+  tmps[gid].dgst[5] = SHA256M_F;
+  tmps[gid].dgst[6] = SHA256M_G;
+  tmps[gid].dgst[7] = SHA256M_H;
+
+  tmps[gid].block[ 0] = 0;
+  tmps[gid].block[ 1] = 0;
+  tmps[gid].block[ 2] = 0;
+  tmps[gid].block[ 3] = 0;
+  tmps[gid].block[ 4] = 0;
+  tmps[gid].block[ 5] = 0;
+  tmps[gid].block[ 6] = 0;
+  tmps[gid].block[ 7] = 0;
+  tmps[gid].block[ 8] = 0;
+  tmps[gid].block[ 9] = 0;
+  tmps[gid].block[10] = 0;
+  tmps[gid].block[11] = 0;
+  tmps[gid].block[12] = 0;
+  tmps[gid].block[13] = 0;
+  tmps[gid].block[14] = 0;
+  tmps[gid].block[15] = 0;
+
+  tmps[gid].block_len = 0;
+  tmps[gid].final_len = 0;
 }
 
-__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_loop (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global seven_zip_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global seven_zip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11600_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global seven_zip_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global seven_zip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 {
   /**
    * base
@@ -1578,23 +1643,41 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_loop (__gl
 
   if (gid >= gid_max) return;
 
-  u32 pw[8];
+  u32 pw[16] = { 0 };
 
-  pw[0] = pws[gid].i[ 0];
-  pw[1] = pws[gid].i[ 1];
-  pw[2] = pws[gid].i[ 2];
-  pw[3] = pws[gid].i[ 3];
-  pw[4] = 0;
-  pw[5] = 0;
-  pw[6] = 0;
-  pw[7] = 0;
+  pw[0] = pws[gid].i[0];
+  pw[1] = pws[gid].i[1];
+  pw[2] = pws[gid].i[2];
+  pw[3] = pws[gid].i[3];
+  pw[4] = pws[gid].i[4];
+  pw[5] = pws[gid].i[5];
+  pw[6] = pws[gid].i[6];
+  pw[7] = pws[gid].i[7];
 
   u32 pw_len = pws[gid].pw_len;
 
-  make_unicode (&pw[0], &pw[0], &pw[4]);
+  make_unicode (&pw[ 4], &pw[ 8], &pw[12]);
+  make_unicode (&pw[ 0], &pw[ 0], &pw[ 4]);
 
   pw_len *= 2;
 
+  pw[ 0] = swap32 (pw[ 0]);
+  pw[ 1] = swap32 (pw[ 1]);
+  pw[ 2] = swap32 (pw[ 2]);
+  pw[ 3] = swap32 (pw[ 3]);
+  pw[ 4] = swap32 (pw[ 4]);
+  pw[ 5] = swap32 (pw[ 5]);
+  pw[ 6] = swap32 (pw[ 6]);
+  pw[ 7] = swap32 (pw[ 7]);
+  pw[ 8] = swap32 (pw[ 8]);
+  pw[ 9] = swap32 (pw[ 9]);
+  pw[10] = swap32 (pw[10]);
+  pw[11] = swap32 (pw[11]);
+  pw[12] = swap32 (pw[12]);
+  pw[13] = swap32 (pw[13]);
+  pw[14] = swap32 (pw[14]);
+  pw[15] = swap32 (pw[15]);
+
   /**
    * context load
    */
@@ -1638,13 +1721,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_loop (__gl
 
   for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++)
   {
-    u32 it[2];
-
-    it[0] = j;
-    it[1] = 0;
+    const u32 j_swap = swap32 (j);
 
-    block_len = memcat32c (block, block_len, pw, pw_len, dgst); final_len += pw_len;
-    block_len = memcat8c  (block, block_len, it,      8, dgst); final_len += 8;
+    block_len = memcat64c_be (block, block_len,     pw, pw_len, dgst); final_len += pw_len;
+    block_len = memcat8c_be  (block, block_len, j_swap,      8, dgst); final_len += 8;
   }
 
   /**
@@ -1681,7 +1761,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_loop (__gl
   tmps[gid].final_len = final_len;
 }
 
-__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__global pw_t *pws, __global gpu_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global seven_zip_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global seven_zip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11600_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global seven_zip_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global seven_zip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 {
   /**
    * base
@@ -1689,13 +1769,12 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl
 
   const u32 gid = get_global_id (0);
   const u32 lid = get_local_id (0);
+  const u32 lsz = get_local_size (0);
 
   /**
    * aes shared
    */
 
-  const u32 lid4 = lid * 4;
-
   __local u32 s_td0[256];
   __local u32 s_td1[256];
   __local u32 s_td2[256];
@@ -1708,55 +1787,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl
   __local u32 s_te3[256];
   __local u32 s_te4[256];
 
-  s_td0[lid4 + 0] = td0[lid4 + 0];
-  s_td0[lid4 + 1] = td0[lid4 + 1];
-  s_td0[lid4 + 2] = td0[lid4 + 2];
-  s_td0[lid4 + 3] = td0[lid4 + 3];
-
-  s_td1[lid4 + 0] = td1[lid4 + 0];
-  s_td1[lid4 + 1] = td1[lid4 + 1];
-  s_td1[lid4 + 2] = td1[lid4 + 2];
-  s_td1[lid4 + 3] = td1[lid4 + 3];
-
-  s_td2[lid4 + 0] = td2[lid4 + 0];
-  s_td2[lid4 + 1] = td2[lid4 + 1];
-  s_td2[lid4 + 2] = td2[lid4 + 2];
-  s_td2[lid4 + 3] = td2[lid4 + 3];
-
-  s_td3[lid4 + 0] = td3[lid4 + 0];
-  s_td3[lid4 + 1] = td3[lid4 + 1];
-  s_td3[lid4 + 2] = td3[lid4 + 2];
-  s_td3[lid4 + 3] = td3[lid4 + 3];
-
-  s_td4[lid4 + 0] = td4[lid4 + 0];
-  s_td4[lid4 + 1] = td4[lid4 + 1];
-  s_td4[lid4 + 2] = td4[lid4 + 2];
-  s_td4[lid4 + 3] = td4[lid4 + 3];
-
-  s_te0[lid4 + 0] = te0[lid4 + 0];
-  s_te0[lid4 + 1] = te0[lid4 + 1];
-  s_te0[lid4 + 2] = te0[lid4 + 2];
-  s_te0[lid4 + 3] = te0[lid4 + 3];
-
-  s_te1[lid4 + 0] = te1[lid4 + 0];
-  s_te1[lid4 + 1] = te1[lid4 + 1];
-  s_te1[lid4 + 2] = te1[lid4 + 2];
-  s_te1[lid4 + 3] = te1[lid4 + 3];
-
-  s_te2[lid4 + 0] = te2[lid4 + 0];
-  s_te2[lid4 + 1] = te2[lid4 + 1];
-  s_te2[lid4 + 2] = te2[lid4 + 2];
-  s_te2[lid4 + 3] = te2[lid4 + 3];
-
-  s_te3[lid4 + 0] = te3[lid4 + 0];
-  s_te3[lid4 + 1] = te3[lid4 + 1];
-  s_te3[lid4 + 2] = te3[lid4 + 2];
-  s_te3[lid4 + 3] = te3[lid4 + 3];
-
-  s_te4[lid4 + 0] = te4[lid4 + 0];
-  s_te4[lid4 + 1] = te4[lid4 + 1];
-  s_te4[lid4 + 2] = te4[lid4 + 2];
-  s_te4[lid4 + 3] = te4[lid4 + 3];
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    s_td0[i] = td0[i];
+    s_td1[i] = td1[i];
+    s_td2[i] = td2[i];
+    s_td3[i] = td3[i];
+    s_td4[i] = td4[i];
+
+    s_te0[i] = te0[i];
+    s_te1[i] = te1[i];
+    s_te2[i] = te2[i];
+    s_te3[i] = te3[i];
+    s_te4[i] = te4[i];
+  }
 
   barrier (CLK_LOCAL_MEM_FENCE);
 
@@ -1777,38 +1821,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl
   dgst[6] = tmps[gid].dgst[6];
   dgst[7] = tmps[gid].dgst[7];
 
-  u32 block[16];
-
-  block[ 0] = tmps[gid].block[ 0];
-  block[ 1] = tmps[gid].block[ 1];
-  block[ 2] = tmps[gid].block[ 2];
-  block[ 3] = tmps[gid].block[ 3];
-  block[ 4] = tmps[gid].block[ 4];
-  block[ 5] = tmps[gid].block[ 5];
-  block[ 6] = tmps[gid].block[ 6];
-  block[ 7] = tmps[gid].block[ 7];
-  block[ 8] = tmps[gid].block[ 8];
-  block[ 9] = tmps[gid].block[ 9];
-  block[10] = tmps[gid].block[10];
-  block[11] = tmps[gid].block[11];
-  block[12] = tmps[gid].block[12];
-  block[13] = tmps[gid].block[13];
-  block[14] = tmps[gid].block[14];
-  block[15] = tmps[gid].block[15];
-
   u32 block_len = tmps[gid].block_len;
   u32 final_len = tmps[gid].final_len;
 
-  append_0x80_1x16 (block, block_len);
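+  // optimization: the saved block is not reloaded; the final block is built as padding only (0x80 marker, zeros, bit length), i.e. as if no partial data were buffered (block_len == 0). this should work as long as we have an iteration 6 or higher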
 
-  if (block_len >= 56)
-  {
-    sha256_transform (block, dgst);
-
-    bzero16 (block);
-  }
+  u32 block[16];
 
-  block[15] = swap32 (final_len * 8);
+  block[ 0] = 0x80000000;
+  block[ 1] = 0;
+  block[ 2] = 0;
+  block[ 3] = 0;
+  block[ 4] = 0;
+  block[ 5] = 0;
+  block[ 6] = 0;
+  block[ 7] = 0;
+  block[ 8] = 0;
+  block[ 9] = 0;
+  block[10] = 0;
+  block[11] = 0;
+  block[12] = 0;
+  block[13] = 0;
+  block[14] = 0;
+  block[15] = final_len * 8;
 
   sha256_transform (block, dgst);
 
@@ -1974,9 +2009,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl
 
     if ((out[0] == 0) && (out[1] == 0) && (out[2] == 0) && (out[3] == 0))
     {
-      mark_hash (plains_buf, hashes_shown, digests_offset + 0, gid, 0);
-
-      d_return_buf[lid] = 1;
+      mark_hash (plains_buf, d_return_buf, salt_pos, 0, digests_offset + 0, gid, 0);
     }
   }