#include "include/kernel_vendor.h"
#include "OpenCL/types_ocl.c"
-static void switch_buffer_by_offset (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+static void switch_buffer_by_offset_le (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
#if defined IS_AMD || defined IS_GENERIC
const int offset_mod_4 = offset & 3;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, pw_r_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, pw_r_len);
}
u32 w0[4];
* License.....: MIT
*/
+/**
+ * pure scalar functions
+ */
+
static int hash_comp (const u32 d1[4], __global u32 *d2)
{
if (d1[3] > d2[DGST_R3]) return ( 1);
plains_buf[hash_pos].il_pos = il_pos;
}
-static void truncate_block (u32 w[4], const u32 len)
+/**
+ * vector functions
+ */
+
+static void truncate_block (u32x w[4], const u32 len)
{
switch (len)
{
}
}
-static void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
-{
- #ifdef IS_NV
- out2[3] = __byte_perm_S (in[3], 0, 0x7372);
- out2[2] = __byte_perm_S (in[3], 0, 0x7170);
- out2[1] = __byte_perm_S (in[2], 0, 0x7372);
- out2[0] = __byte_perm_S (in[2], 0, 0x7170);
- out1[3] = __byte_perm_S (in[1], 0, 0x7372);
- out1[2] = __byte_perm_S (in[1], 0, 0x7170);
- out1[1] = __byte_perm_S (in[0], 0, 0x7372);
- out1[0] = __byte_perm_S (in[0], 0, 0x7170);
- #endif
-
- #if defined IS_AMD || defined IS_GENERIC
- out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
- out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
- out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
- out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
- out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
- out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
- out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
- out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
- #endif
-}
-
static void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
{
#ifdef IS_NV
#endif
}
-static void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
-{
- #ifdef IS_NV
- out[0] = __byte_perm_S (in1[0], in1[1], 0x6420);
- out[1] = __byte_perm_S (in1[2], in1[3], 0x6420);
- out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
- out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
- #endif
-
- #if defined IS_AMD || defined IS_GENERIC
- out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
- | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
- out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
- | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
- out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
- | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
- out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
- | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
- #endif
-}
-
static void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
{
#ifdef IS_NV
#endif
}
-static void append_0x01_1x4 (u32 w0[4], const u32 offset)
+static void append_0x01_1x4 (u32x w0[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x01_2x4 (u32 w0[4], u32 w1[4], const u32 offset)
+static void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x01_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
+static void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x01_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+static void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x01_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
+static void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_1x4 (u32 w0[4], const u32 offset)
+static void append_0x02_1x4 (u32x w0[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_2x4 (u32 w0[4], u32 w1[4], const u32 offset)
+static void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
+static void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+static void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
+static void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_1x4 (u32 w0[4], const u32 offset)
+static void append_0x80_1x4 (u32x w0[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
+static void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+static void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
+static void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_1x16 (u32 w[16], const u32 offset)
+static void append_0x80_1x16 (u32x w[16], const u32 offset)
{
switch (offset)
{
}
}
-static void switch_buffer_by_offset_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
#if defined IS_AMD || defined IS_GENERIC
const int offset_mod_4 = offset & 3;
switch (offset / 4)
{
case 0:
- w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4);
- w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4);
- w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
- w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
- w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
- w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
- w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
- w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
- w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
- w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4);
+ w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4);
+ w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4);
+ w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4);
+ w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4);
+ w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
+ w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
+ w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
+ w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
+ w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w0[0] = amd_bytealign (w0[0], 0, offset_minus_4);
if (offset_mod_4 == 0)
{
break;
case 1:
- w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4);
- w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
- w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
- w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
- w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
- w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
- w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
- w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
- w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4);
+ w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4);
+ w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4);
+ w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4);
+ w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4);
+ w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
+ w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
+ w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
+ w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w0[1] = amd_bytealign (w0[0], 0, offset_minus_4);
w0[0] = 0;
if (offset_mod_4 == 0)
break;
case 2:
- w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4);
- w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
- w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
- w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
- w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
- w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
- w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
- w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4);
+ w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4);
+ w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4);
+ w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4);
+ w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4);
+ w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
+ w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
+ w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w0[2] = amd_bytealign (w0[0], 0, offset_minus_4);
w0[1] = 0;
w0[0] = 0;
break;
case 3:
- w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4);
- w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
- w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
- w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
- w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
- w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
- w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4);
+ w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4);
+ w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4);
+ w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4);
+ w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4);
+ w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
+ w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w0[3] = amd_bytealign (w0[0], 0, offset_minus_4);
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 4:
- w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4);
- w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
- w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
- w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
- w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
- w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4);
+ w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
+ w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
+ w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
+ w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
+ w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w1[0] = amd_bytealign (w0[0], 0, offset_minus_4);
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
break;
case 5:
- w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4);
- w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
- w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
- w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
- w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4);
+ w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
+ w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
+ w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
+ w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w1[1] = amd_bytealign (w0[0], 0, offset_minus_4);
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
break;
case 6:
- w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4);
- w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
- w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
- w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4);
+ w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
+ w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
+ w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w1[2] = amd_bytealign (w0[0], 0, offset_minus_4);
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
break;
case 7:
- w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4);
- w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
- w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4);
+ w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
+ w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w1[3] = amd_bytealign (w0[0], 0, offset_minus_4);
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
break;
case 8:
- w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4);
- w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
- w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4);
+ w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
+ w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w2[0] = amd_bytealign (w0[0], 0, offset_minus_4);
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
break;
case 9:
- w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4);
- w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
- w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4);
+ w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
+ w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w2[1] = amd_bytealign (w0[0], 0, offset_minus_4);
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
break;
case 10:
- w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4);
- w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
- w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4);
+ w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
+ w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w2[2] = amd_bytealign (w0[0], 0, offset_minus_4);
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
break;
case 11:
- w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4);
- w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
- w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4);
+ w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
+ w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w2[3] = amd_bytealign (w0[0], 0, offset_minus_4);
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
break;
case 12:
- w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4);
- w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
- w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4);
+ w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
+ w3[0] = amd_bytealign (w0[0], 0, offset_minus_4);
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
break;
case 13:
- w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4);
- w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4);
+ w3[1] = amd_bytealign (w0[0], 0, offset_minus_4);
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
switch (offset / 4)
{
case 0:
- w3[1] = __byte_perm_S (w3[0], w3[1], selector);
- w3[0] = __byte_perm_S (w2[3], w3[0], selector);
- w2[3] = __byte_perm_S (w2[2], w2[3], selector);
- w2[2] = __byte_perm_S (w2[1], w2[2], selector);
- w2[1] = __byte_perm_S (w2[0], w2[1], selector);
- w2[0] = __byte_perm_S (w1[3], w2[0], selector);
- w1[3] = __byte_perm_S (w1[2], w1[3], selector);
- w1[2] = __byte_perm_S (w1[1], w1[2], selector);
- w1[1] = __byte_perm_S (w1[0], w1[1], selector);
- w1[0] = __byte_perm_S (w0[3], w1[0], selector);
- w0[3] = __byte_perm_S (w0[2], w0[3], selector);
- w0[2] = __byte_perm_S (w0[1], w0[2], selector);
- w0[1] = __byte_perm_S (w0[0], w0[1], selector);
- w0[0] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w3[0], w3[1], selector);
+ w3[0] = __byte_perm (w2[3], w3[0], selector);
+ w2[3] = __byte_perm (w2[2], w2[3], selector);
+ w2[2] = __byte_perm (w2[1], w2[2], selector);
+ w2[1] = __byte_perm (w2[0], w2[1], selector);
+ w2[0] = __byte_perm (w1[3], w2[0], selector);
+ w1[3] = __byte_perm (w1[2], w1[3], selector);
+ w1[2] = __byte_perm (w1[1], w1[2], selector);
+ w1[1] = __byte_perm (w1[0], w1[1], selector);
+ w1[0] = __byte_perm (w0[3], w1[0], selector);
+ w0[3] = __byte_perm (w0[2], w0[3], selector);
+ w0[2] = __byte_perm (w0[1], w0[2], selector);
+ w0[1] = __byte_perm (w0[0], w0[1], selector);
+ w0[0] = __byte_perm ( 0, w0[0], selector);
break;
case 1:
- w3[1] = __byte_perm_S (w2[3], w3[0], selector);
- w3[0] = __byte_perm_S (w2[2], w2[3], selector);
- w2[3] = __byte_perm_S (w2[1], w2[2], selector);
- w2[2] = __byte_perm_S (w2[0], w2[1], selector);
- w2[1] = __byte_perm_S (w1[3], w2[0], selector);
- w2[0] = __byte_perm_S (w1[2], w1[3], selector);
- w1[3] = __byte_perm_S (w1[1], w1[2], selector);
- w1[2] = __byte_perm_S (w1[0], w1[1], selector);
- w1[1] = __byte_perm_S (w0[3], w1[0], selector);
- w1[0] = __byte_perm_S (w0[2], w0[3], selector);
- w0[3] = __byte_perm_S (w0[1], w0[2], selector);
- w0[2] = __byte_perm_S (w0[0], w0[1], selector);
- w0[1] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w2[3], w3[0], selector);
+ w3[0] = __byte_perm (w2[2], w2[3], selector);
+ w2[3] = __byte_perm (w2[1], w2[2], selector);
+ w2[2] = __byte_perm (w2[0], w2[1], selector);
+ w2[1] = __byte_perm (w1[3], w2[0], selector);
+ w2[0] = __byte_perm (w1[2], w1[3], selector);
+ w1[3] = __byte_perm (w1[1], w1[2], selector);
+ w1[2] = __byte_perm (w1[0], w1[1], selector);
+ w1[1] = __byte_perm (w0[3], w1[0], selector);
+ w1[0] = __byte_perm (w0[2], w0[3], selector);
+ w0[3] = __byte_perm (w0[1], w0[2], selector);
+ w0[2] = __byte_perm (w0[0], w0[1], selector);
+ w0[1] = __byte_perm ( 0, w0[0], selector);
w0[0] = 0;
break;
case 2:
- w3[1] = __byte_perm_S (w2[2], w2[3], selector);
- w3[0] = __byte_perm_S (w2[1], w2[2], selector);
- w2[3] = __byte_perm_S (w2[0], w2[1], selector);
- w2[2] = __byte_perm_S (w1[3], w2[0], selector);
- w2[1] = __byte_perm_S (w1[2], w1[3], selector);
- w2[0] = __byte_perm_S (w1[1], w1[2], selector);
- w1[3] = __byte_perm_S (w1[0], w1[1], selector);
- w1[2] = __byte_perm_S (w0[3], w1[0], selector);
- w1[1] = __byte_perm_S (w0[2], w0[3], selector);
- w1[0] = __byte_perm_S (w0[1], w0[2], selector);
- w0[3] = __byte_perm_S (w0[0], w0[1], selector);
- w0[2] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w2[2], w2[3], selector);
+ w3[0] = __byte_perm (w2[1], w2[2], selector);
+ w2[3] = __byte_perm (w2[0], w2[1], selector);
+ w2[2] = __byte_perm (w1[3], w2[0], selector);
+ w2[1] = __byte_perm (w1[2], w1[3], selector);
+ w2[0] = __byte_perm (w1[1], w1[2], selector);
+ w1[3] = __byte_perm (w1[0], w1[1], selector);
+ w1[2] = __byte_perm (w0[3], w1[0], selector);
+ w1[1] = __byte_perm (w0[2], w0[3], selector);
+ w1[0] = __byte_perm (w0[1], w0[2], selector);
+ w0[3] = __byte_perm (w0[0], w0[1], selector);
+ w0[2] = __byte_perm ( 0, w0[0], selector);
w0[1] = 0;
w0[0] = 0;
break;
case 3:
- w3[1] = __byte_perm_S (w2[1], w2[2], selector);
- w3[0] = __byte_perm_S (w2[0], w2[1], selector);
- w2[3] = __byte_perm_S (w1[3], w2[0], selector);
- w2[2] = __byte_perm_S (w1[2], w1[3], selector);
- w2[1] = __byte_perm_S (w1[1], w1[2], selector);
- w2[0] = __byte_perm_S (w1[0], w1[1], selector);
- w1[3] = __byte_perm_S (w0[3], w1[0], selector);
- w1[2] = __byte_perm_S (w0[2], w0[3], selector);
- w1[1] = __byte_perm_S (w0[1], w0[2], selector);
- w1[0] = __byte_perm_S (w0[0], w0[1], selector);
- w0[3] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w2[1], w2[2], selector);
+ w3[0] = __byte_perm (w2[0], w2[1], selector);
+ w2[3] = __byte_perm (w1[3], w2[0], selector);
+ w2[2] = __byte_perm (w1[2], w1[3], selector);
+ w2[1] = __byte_perm (w1[1], w1[2], selector);
+ w2[0] = __byte_perm (w1[0], w1[1], selector);
+ w1[3] = __byte_perm (w0[3], w1[0], selector);
+ w1[2] = __byte_perm (w0[2], w0[3], selector);
+ w1[1] = __byte_perm (w0[1], w0[2], selector);
+ w1[0] = __byte_perm (w0[0], w0[1], selector);
+ w0[3] = __byte_perm ( 0, w0[0], selector);
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 4:
- w3[1] = __byte_perm_S (w2[0], w2[1], selector);
- w3[0] = __byte_perm_S (w1[3], w2[0], selector);
- w2[3] = __byte_perm_S (w1[2], w1[3], selector);
- w2[2] = __byte_perm_S (w1[1], w1[2], selector);
- w2[1] = __byte_perm_S (w1[0], w1[1], selector);
- w2[0] = __byte_perm_S (w0[3], w1[0], selector);
- w1[3] = __byte_perm_S (w0[2], w0[3], selector);
- w1[2] = __byte_perm_S (w0[1], w0[2], selector);
- w1[1] = __byte_perm_S (w0[0], w0[1], selector);
- w1[0] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w2[0], w2[1], selector);
+ w3[0] = __byte_perm (w1[3], w2[0], selector);
+ w2[3] = __byte_perm (w1[2], w1[3], selector);
+ w2[2] = __byte_perm (w1[1], w1[2], selector);
+ w2[1] = __byte_perm (w1[0], w1[1], selector);
+ w2[0] = __byte_perm (w0[3], w1[0], selector);
+ w1[3] = __byte_perm (w0[2], w0[3], selector);
+ w1[2] = __byte_perm (w0[1], w0[2], selector);
+ w1[1] = __byte_perm (w0[0], w0[1], selector);
+ w1[0] = __byte_perm ( 0, w0[0], selector);
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
break;
case 5:
- w3[1] = __byte_perm_S (w1[3], w2[0], selector);
- w3[0] = __byte_perm_S (w1[2], w1[3], selector);
- w2[3] = __byte_perm_S (w1[1], w1[2], selector);
- w2[2] = __byte_perm_S (w1[0], w1[1], selector);
- w2[1] = __byte_perm_S (w0[3], w1[0], selector);
- w2[0] = __byte_perm_S (w0[2], w0[3], selector);
- w1[3] = __byte_perm_S (w0[1], w0[2], selector);
- w1[2] = __byte_perm_S (w0[0], w0[1], selector);
- w1[1] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w1[3], w2[0], selector);
+ w3[0] = __byte_perm (w1[2], w1[3], selector);
+ w2[3] = __byte_perm (w1[1], w1[2], selector);
+ w2[2] = __byte_perm (w1[0], w1[1], selector);
+ w2[1] = __byte_perm (w0[3], w1[0], selector);
+ w2[0] = __byte_perm (w0[2], w0[3], selector);
+ w1[3] = __byte_perm (w0[1], w0[2], selector);
+ w1[2] = __byte_perm (w0[0], w0[1], selector);
+ w1[1] = __byte_perm ( 0, w0[0], selector);
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
break;
case 6:
- w3[1] = __byte_perm_S (w1[2], w1[3], selector);
- w3[0] = __byte_perm_S (w1[1], w1[2], selector);
- w2[3] = __byte_perm_S (w1[0], w1[1], selector);
- w2[2] = __byte_perm_S (w0[3], w1[0], selector);
- w2[1] = __byte_perm_S (w0[2], w0[3], selector);
- w2[0] = __byte_perm_S (w0[1], w0[2], selector);
- w1[3] = __byte_perm_S (w0[0], w0[1], selector);
- w1[2] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w1[2], w1[3], selector);
+ w3[0] = __byte_perm (w1[1], w1[2], selector);
+ w2[3] = __byte_perm (w1[0], w1[1], selector);
+ w2[2] = __byte_perm (w0[3], w1[0], selector);
+ w2[1] = __byte_perm (w0[2], w0[3], selector);
+ w2[0] = __byte_perm (w0[1], w0[2], selector);
+ w1[3] = __byte_perm (w0[0], w0[1], selector);
+ w1[2] = __byte_perm ( 0, w0[0], selector);
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
break;
case 7:
- w3[1] = __byte_perm_S (w1[1], w1[2], selector);
- w3[0] = __byte_perm_S (w1[0], w1[1], selector);
- w2[3] = __byte_perm_S (w0[3], w1[0], selector);
- w2[2] = __byte_perm_S (w0[2], w0[3], selector);
- w2[1] = __byte_perm_S (w0[1], w0[2], selector);
- w2[0] = __byte_perm_S (w0[0], w0[1], selector);
- w1[3] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w1[1], w1[2], selector);
+ w3[0] = __byte_perm (w1[0], w1[1], selector);
+ w2[3] = __byte_perm (w0[3], w1[0], selector);
+ w2[2] = __byte_perm (w0[2], w0[3], selector);
+ w2[1] = __byte_perm (w0[1], w0[2], selector);
+ w2[0] = __byte_perm (w0[0], w0[1], selector);
+ w1[3] = __byte_perm ( 0, w0[0], selector);
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
break;
case 8:
- w3[1] = __byte_perm_S (w1[0], w1[1], selector);
- w3[0] = __byte_perm_S (w0[3], w1[0], selector);
- w2[3] = __byte_perm_S (w0[2], w0[3], selector);
- w2[2] = __byte_perm_S (w0[1], w0[2], selector);
- w2[1] = __byte_perm_S (w0[0], w0[1], selector);
- w2[0] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w1[0], w1[1], selector);
+ w3[0] = __byte_perm (w0[3], w1[0], selector);
+ w2[3] = __byte_perm (w0[2], w0[3], selector);
+ w2[2] = __byte_perm (w0[1], w0[2], selector);
+ w2[1] = __byte_perm (w0[0], w0[1], selector);
+ w2[0] = __byte_perm ( 0, w0[0], selector);
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
break;
case 9:
- w3[1] = __byte_perm_S (w0[3], w1[0], selector);
- w3[0] = __byte_perm_S (w0[2], w0[3], selector);
- w2[3] = __byte_perm_S (w0[1], w0[2], selector);
- w2[2] = __byte_perm_S (w0[0], w0[1], selector);
- w2[1] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w0[3], w1[0], selector);
+ w3[0] = __byte_perm (w0[2], w0[3], selector);
+ w2[3] = __byte_perm (w0[1], w0[2], selector);
+ w2[2] = __byte_perm (w0[0], w0[1], selector);
+ w2[1] = __byte_perm ( 0, w0[0], selector);
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
break;
case 10:
- w3[1] = __byte_perm_S (w0[2], w0[3], selector);
- w3[0] = __byte_perm_S (w0[1], w0[2], selector);
- w2[3] = __byte_perm_S (w0[0], w0[1], selector);
- w2[2] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w0[2], w0[3], selector);
+ w3[0] = __byte_perm (w0[1], w0[2], selector);
+ w2[3] = __byte_perm (w0[0], w0[1], selector);
+ w2[2] = __byte_perm ( 0, w0[0], selector);
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
break;
case 11:
- w3[1] = __byte_perm_S (w0[1], w0[2], selector);
- w3[0] = __byte_perm_S (w0[0], w0[1], selector);
- w2[3] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w0[1], w0[2], selector);
+ w3[0] = __byte_perm (w0[0], w0[1], selector);
+ w2[3] = __byte_perm ( 0, w0[0], selector);
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
break;
case 12:
- w3[1] = __byte_perm_S (w0[0], w0[1], selector);
- w3[0] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm (w0[0], w0[1], selector);
+ w3[0] = __byte_perm ( 0, w0[0], selector);
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
break;
case 13:
- w3[1] = __byte_perm_S ( 0, w0[0], selector);
+ w3[1] = __byte_perm ( 0, w0[0], selector);
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
#endif
}
-static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
#if defined IS_AMD || defined IS_GENERIC
switch (offset / 4)
{
case 0:
- w3[2] = amd_bytealign_S (w3[1], 0, offset);
- w3[1] = amd_bytealign_S (w3[0], w3[1], offset);
- w3[0] = amd_bytealign_S (w2[3], w3[0], offset);
- w2[3] = amd_bytealign_S (w2[2], w2[3], offset);
- w2[2] = amd_bytealign_S (w2[1], w2[2], offset);
- w2[1] = amd_bytealign_S (w2[0], w2[1], offset);
- w2[0] = amd_bytealign_S (w1[3], w2[0], offset);
- w1[3] = amd_bytealign_S (w1[2], w1[3], offset);
- w1[2] = amd_bytealign_S (w1[1], w1[2], offset);
- w1[1] = amd_bytealign_S (w1[0], w1[1], offset);
- w1[0] = amd_bytealign_S (w0[3], w1[0], offset);
- w0[3] = amd_bytealign_S (w0[2], w0[3], offset);
- w0[2] = amd_bytealign_S (w0[1], w0[2], offset);
- w0[1] = amd_bytealign_S (w0[0], w0[1], offset);
- w0[0] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w3[1], 0, offset);
+ w3[1] = amd_bytealign (w3[0], w3[1], offset);
+ w3[0] = amd_bytealign (w2[3], w3[0], offset);
+ w2[3] = amd_bytealign (w2[2], w2[3], offset);
+ w2[2] = amd_bytealign (w2[1], w2[2], offset);
+ w2[1] = amd_bytealign (w2[0], w2[1], offset);
+ w2[0] = amd_bytealign (w1[3], w2[0], offset);
+ w1[3] = amd_bytealign (w1[2], w1[3], offset);
+ w1[2] = amd_bytealign (w1[1], w1[2], offset);
+ w1[1] = amd_bytealign (w1[0], w1[1], offset);
+ w1[0] = amd_bytealign (w0[3], w1[0], offset);
+ w0[3] = amd_bytealign (w0[2], w0[3], offset);
+ w0[2] = amd_bytealign (w0[1], w0[2], offset);
+ w0[1] = amd_bytealign (w0[0], w0[1], offset);
+ w0[0] = amd_bytealign ( 0, w0[0], offset);
break;
case 1:
- w3[2] = amd_bytealign_S (w3[0], 0, offset);
- w3[1] = amd_bytealign_S (w2[3], w3[0], offset);
- w3[0] = amd_bytealign_S (w2[2], w2[3], offset);
- w2[3] = amd_bytealign_S (w2[1], w2[2], offset);
- w2[2] = amd_bytealign_S (w2[0], w2[1], offset);
- w2[1] = amd_bytealign_S (w1[3], w2[0], offset);
- w2[0] = amd_bytealign_S (w1[2], w1[3], offset);
- w1[3] = amd_bytealign_S (w1[1], w1[2], offset);
- w1[2] = amd_bytealign_S (w1[0], w1[1], offset);
- w1[1] = amd_bytealign_S (w0[3], w1[0], offset);
- w1[0] = amd_bytealign_S (w0[2], w0[3], offset);
- w0[3] = amd_bytealign_S (w0[1], w0[2], offset);
- w0[2] = amd_bytealign_S (w0[0], w0[1], offset);
- w0[1] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w3[0], 0, offset);
+ w3[1] = amd_bytealign (w2[3], w3[0], offset);
+ w3[0] = amd_bytealign (w2[2], w2[3], offset);
+ w2[3] = amd_bytealign (w2[1], w2[2], offset);
+ w2[2] = amd_bytealign (w2[0], w2[1], offset);
+ w2[1] = amd_bytealign (w1[3], w2[0], offset);
+ w2[0] = amd_bytealign (w1[2], w1[3], offset);
+ w1[3] = amd_bytealign (w1[1], w1[2], offset);
+ w1[2] = amd_bytealign (w1[0], w1[1], offset);
+ w1[1] = amd_bytealign (w0[3], w1[0], offset);
+ w1[0] = amd_bytealign (w0[2], w0[3], offset);
+ w0[3] = amd_bytealign (w0[1], w0[2], offset);
+ w0[2] = amd_bytealign (w0[0], w0[1], offset);
+ w0[1] = amd_bytealign ( 0, w0[0], offset);
w0[0] = 0;
break;
case 2:
- w3[2] = amd_bytealign_S (w2[3], 0, offset);
- w3[1] = amd_bytealign_S (w2[2], w2[3], offset);
- w3[0] = amd_bytealign_S (w2[1], w2[2], offset);
- w2[3] = amd_bytealign_S (w2[0], w2[1], offset);
- w2[2] = amd_bytealign_S (w1[3], w2[0], offset);
- w2[1] = amd_bytealign_S (w1[2], w1[3], offset);
- w2[0] = amd_bytealign_S (w1[1], w1[2], offset);
- w1[3] = amd_bytealign_S (w1[0], w1[1], offset);
- w1[2] = amd_bytealign_S (w0[3], w1[0], offset);
- w1[1] = amd_bytealign_S (w0[2], w0[3], offset);
- w1[0] = amd_bytealign_S (w0[1], w0[2], offset);
- w0[3] = amd_bytealign_S (w0[0], w0[1], offset);
- w0[2] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w2[3], 0, offset);
+ w3[1] = amd_bytealign (w2[2], w2[3], offset);
+ w3[0] = amd_bytealign (w2[1], w2[2], offset);
+ w2[3] = amd_bytealign (w2[0], w2[1], offset);
+ w2[2] = amd_bytealign (w1[3], w2[0], offset);
+ w2[1] = amd_bytealign (w1[2], w1[3], offset);
+ w2[0] = amd_bytealign (w1[1], w1[2], offset);
+ w1[3] = amd_bytealign (w1[0], w1[1], offset);
+ w1[2] = amd_bytealign (w0[3], w1[0], offset);
+ w1[1] = amd_bytealign (w0[2], w0[3], offset);
+ w1[0] = amd_bytealign (w0[1], w0[2], offset);
+ w0[3] = amd_bytealign (w0[0], w0[1], offset);
+ w0[2] = amd_bytealign ( 0, w0[0], offset);
w0[1] = 0;
w0[0] = 0;
break;
case 3:
- w3[2] = amd_bytealign_S (w2[2], 0, offset);
- w3[1] = amd_bytealign_S (w2[1], w2[2], offset);
- w3[0] = amd_bytealign_S (w2[0], w2[1], offset);
- w2[3] = amd_bytealign_S (w1[3], w2[0], offset);
- w2[2] = amd_bytealign_S (w1[2], w1[3], offset);
- w2[1] = amd_bytealign_S (w1[1], w1[2], offset);
- w2[0] = amd_bytealign_S (w1[0], w1[1], offset);
- w1[3] = amd_bytealign_S (w0[3], w1[0], offset);
- w1[2] = amd_bytealign_S (w0[2], w0[3], offset);
- w1[1] = amd_bytealign_S (w0[1], w0[2], offset);
- w1[0] = amd_bytealign_S (w0[0], w0[1], offset);
- w0[3] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w2[2], 0, offset);
+ w3[1] = amd_bytealign (w2[1], w2[2], offset);
+ w3[0] = amd_bytealign (w2[0], w2[1], offset);
+ w2[3] = amd_bytealign (w1[3], w2[0], offset);
+ w2[2] = amd_bytealign (w1[2], w1[3], offset);
+ w2[1] = amd_bytealign (w1[1], w1[2], offset);
+ w2[0] = amd_bytealign (w1[0], w1[1], offset);
+ w1[3] = amd_bytealign (w0[3], w1[0], offset);
+ w1[2] = amd_bytealign (w0[2], w0[3], offset);
+ w1[1] = amd_bytealign (w0[1], w0[2], offset);
+ w1[0] = amd_bytealign (w0[0], w0[1], offset);
+ w0[3] = amd_bytealign ( 0, w0[0], offset);
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 4:
- w3[2] = amd_bytealign_S (w2[1], 0, offset);
- w3[1] = amd_bytealign_S (w2[0], w2[1], offset);
- w3[0] = amd_bytealign_S (w1[3], w2[0], offset);
- w2[3] = amd_bytealign_S (w1[2], w1[3], offset);
- w2[2] = amd_bytealign_S (w1[1], w1[2], offset);
- w2[1] = amd_bytealign_S (w1[0], w1[1], offset);
- w2[0] = amd_bytealign_S (w0[3], w1[0], offset);
- w1[3] = amd_bytealign_S (w0[2], w0[3], offset);
- w1[2] = amd_bytealign_S (w0[1], w0[2], offset);
- w1[1] = amd_bytealign_S (w0[0], w0[1], offset);
- w1[0] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w2[1], 0, offset);
+ w3[1] = amd_bytealign (w2[0], w2[1], offset);
+ w3[0] = amd_bytealign (w1[3], w2[0], offset);
+ w2[3] = amd_bytealign (w1[2], w1[3], offset);
+ w2[2] = amd_bytealign (w1[1], w1[2], offset);
+ w2[1] = amd_bytealign (w1[0], w1[1], offset);
+ w2[0] = amd_bytealign (w0[3], w1[0], offset);
+ w1[3] = amd_bytealign (w0[2], w0[3], offset);
+ w1[2] = amd_bytealign (w0[1], w0[2], offset);
+ w1[1] = amd_bytealign (w0[0], w0[1], offset);
+ w1[0] = amd_bytealign ( 0, w0[0], offset);
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
break;
case 5:
- w3[2] = amd_bytealign_S (w2[0], 0, offset);
- w3[1] = amd_bytealign_S (w1[3], w2[0], offset);
- w3[0] = amd_bytealign_S (w1[2], w1[3], offset);
- w2[3] = amd_bytealign_S (w1[1], w1[2], offset);
- w2[2] = amd_bytealign_S (w1[0], w1[1], offset);
- w2[1] = amd_bytealign_S (w0[3], w1[0], offset);
- w2[0] = amd_bytealign_S (w0[2], w0[3], offset);
- w1[3] = amd_bytealign_S (w0[1], w0[2], offset);
- w1[2] = amd_bytealign_S (w0[0], w0[1], offset);
- w1[1] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w2[0], 0, offset);
+ w3[1] = amd_bytealign (w1[3], w2[0], offset);
+ w3[0] = amd_bytealign (w1[2], w1[3], offset);
+ w2[3] = amd_bytealign (w1[1], w1[2], offset);
+ w2[2] = amd_bytealign (w1[0], w1[1], offset);
+ w2[1] = amd_bytealign (w0[3], w1[0], offset);
+ w2[0] = amd_bytealign (w0[2], w0[3], offset);
+ w1[3] = amd_bytealign (w0[1], w0[2], offset);
+ w1[2] = amd_bytealign (w0[0], w0[1], offset);
+ w1[1] = amd_bytealign ( 0, w0[0], offset);
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
break;
case 6:
- w3[2] = amd_bytealign_S (w1[3], 0, offset);
- w3[1] = amd_bytealign_S (w1[2], w1[3], offset);
- w3[0] = amd_bytealign_S (w1[1], w1[2], offset);
- w2[3] = amd_bytealign_S (w1[0], w1[1], offset);
- w2[2] = amd_bytealign_S (w0[3], w1[0], offset);
- w2[1] = amd_bytealign_S (w0[2], w0[3], offset);
- w2[0] = amd_bytealign_S (w0[1], w0[2], offset);
- w1[3] = amd_bytealign_S (w0[0], w0[1], offset);
- w1[2] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w1[3], 0, offset);
+ w3[1] = amd_bytealign (w1[2], w1[3], offset);
+ w3[0] = amd_bytealign (w1[1], w1[2], offset);
+ w2[3] = amd_bytealign (w1[0], w1[1], offset);
+ w2[2] = amd_bytealign (w0[3], w1[0], offset);
+ w2[1] = amd_bytealign (w0[2], w0[3], offset);
+ w2[0] = amd_bytealign (w0[1], w0[2], offset);
+ w1[3] = amd_bytealign (w0[0], w0[1], offset);
+ w1[2] = amd_bytealign ( 0, w0[0], offset);
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
break;
case 7:
- w3[2] = amd_bytealign_S (w1[2], 0, offset);
- w3[1] = amd_bytealign_S (w1[1], w1[2], offset);
- w3[0] = amd_bytealign_S (w1[0], w1[1], offset);
- w2[3] = amd_bytealign_S (w0[3], w1[0], offset);
- w2[2] = amd_bytealign_S (w0[2], w0[3], offset);
- w2[1] = amd_bytealign_S (w0[1], w0[2], offset);
- w2[0] = amd_bytealign_S (w0[0], w0[1], offset);
- w1[3] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w1[2], 0, offset);
+ w3[1] = amd_bytealign (w1[1], w1[2], offset);
+ w3[0] = amd_bytealign (w1[0], w1[1], offset);
+ w2[3] = amd_bytealign (w0[3], w1[0], offset);
+ w2[2] = amd_bytealign (w0[2], w0[3], offset);
+ w2[1] = amd_bytealign (w0[1], w0[2], offset);
+ w2[0] = amd_bytealign (w0[0], w0[1], offset);
+ w1[3] = amd_bytealign ( 0, w0[0], offset);
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
break;
case 8:
- w3[2] = amd_bytealign_S (w1[1], 0, offset);
- w3[1] = amd_bytealign_S (w1[0], w1[1], offset);
- w3[0] = amd_bytealign_S (w0[3], w1[0], offset);
- w2[3] = amd_bytealign_S (w0[2], w0[3], offset);
- w2[2] = amd_bytealign_S (w0[1], w0[2], offset);
- w2[1] = amd_bytealign_S (w0[0], w0[1], offset);
- w2[0] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w1[1], 0, offset);
+ w3[1] = amd_bytealign (w1[0], w1[1], offset);
+ w3[0] = amd_bytealign (w0[3], w1[0], offset);
+ w2[3] = amd_bytealign (w0[2], w0[3], offset);
+ w2[2] = amd_bytealign (w0[1], w0[2], offset);
+ w2[1] = amd_bytealign (w0[0], w0[1], offset);
+ w2[0] = amd_bytealign ( 0, w0[0], offset);
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
break;
case 9:
- w3[2] = amd_bytealign_S (w1[0], 0, offset);
- w3[1] = amd_bytealign_S (w0[3], w1[0], offset);
- w3[0] = amd_bytealign_S (w0[2], w0[3], offset);
- w2[3] = amd_bytealign_S (w0[1], w0[2], offset);
- w2[2] = amd_bytealign_S (w0[0], w0[1], offset);
- w2[1] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w1[0], 0, offset);
+ w3[1] = amd_bytealign (w0[3], w1[0], offset);
+ w3[0] = amd_bytealign (w0[2], w0[3], offset);
+ w2[3] = amd_bytealign (w0[1], w0[2], offset);
+ w2[2] = amd_bytealign (w0[0], w0[1], offset);
+ w2[1] = amd_bytealign ( 0, w0[0], offset);
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
break;
case 10:
- w3[2] = amd_bytealign_S (w0[3], 0, offset);
- w3[1] = amd_bytealign_S (w0[2], w0[3], offset);
- w3[0] = amd_bytealign_S (w0[1], w0[2], offset);
- w2[3] = amd_bytealign_S (w0[0], w0[1], offset);
- w2[2] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w0[3], 0, offset);
+ w3[1] = amd_bytealign (w0[2], w0[3], offset);
+ w3[0] = amd_bytealign (w0[1], w0[2], offset);
+ w2[3] = amd_bytealign (w0[0], w0[1], offset);
+ w2[2] = amd_bytealign ( 0, w0[0], offset);
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
break;
case 11:
- w3[2] = amd_bytealign_S (w0[2], 0, offset);
- w3[1] = amd_bytealign_S (w0[1], w0[2], offset);
- w3[0] = amd_bytealign_S (w0[0], w0[1], offset);
- w2[3] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w0[2], 0, offset);
+ w3[1] = amd_bytealign (w0[1], w0[2], offset);
+ w3[0] = amd_bytealign (w0[0], w0[1], offset);
+ w2[3] = amd_bytealign ( 0, w0[0], offset);
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
break;
case 12:
- w3[2] = amd_bytealign_S (w0[1], 0, offset);
- w3[1] = amd_bytealign_S (w0[0], w0[1], offset);
- w3[0] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w0[1], 0, offset);
+ w3[1] = amd_bytealign (w0[0], w0[1], offset);
+ w3[0] = amd_bytealign ( 0, w0[0], offset);
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
break;
case 13:
- w3[2] = amd_bytealign_S (w0[0], 0, offset);
- w3[1] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[2] = amd_bytealign (w0[0], 0, offset);
+ w3[1] = amd_bytealign ( 0, w0[0], offset);
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
switch (offset / 4)
{
case 0:
- w3[1] = __byte_perm_S (w3[1], w3[0], selector);
- w3[0] = __byte_perm_S (w3[0], w2[3], selector);
- w2[3] = __byte_perm_S (w2[3], w2[2], selector);
- w2[2] = __byte_perm_S (w2[2], w2[1], selector);
- w2[1] = __byte_perm_S (w2[1], w2[0], selector);
- w2[0] = __byte_perm_S (w2[0], w1[3], selector);
- w1[3] = __byte_perm_S (w1[3], w1[2], selector);
- w1[2] = __byte_perm_S (w1[2], w1[1], selector);
- w1[1] = __byte_perm_S (w1[1], w1[0], selector);
- w1[0] = __byte_perm_S (w1[0], w0[3], selector);
- w0[3] = __byte_perm_S (w0[3], w0[2], selector);
- w0[2] = __byte_perm_S (w0[2], w0[1], selector);
- w0[1] = __byte_perm_S (w0[1], w0[0], selector);
- w0[0] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w3[1], w3[0], selector);
+ w3[0] = __byte_perm (w3[0], w2[3], selector);
+ w2[3] = __byte_perm (w2[3], w2[2], selector);
+ w2[2] = __byte_perm (w2[2], w2[1], selector);
+ w2[1] = __byte_perm (w2[1], w2[0], selector);
+ w2[0] = __byte_perm (w2[0], w1[3], selector);
+ w1[3] = __byte_perm (w1[3], w1[2], selector);
+ w1[2] = __byte_perm (w1[2], w1[1], selector);
+ w1[1] = __byte_perm (w1[1], w1[0], selector);
+ w1[0] = __byte_perm (w1[0], w0[3], selector);
+ w0[3] = __byte_perm (w0[3], w0[2], selector);
+ w0[2] = __byte_perm (w0[2], w0[1], selector);
+ w0[1] = __byte_perm (w0[1], w0[0], selector);
+ w0[0] = __byte_perm (w0[0], 0, selector);
break;
case 1:
- w3[1] = __byte_perm_S (w3[0], w2[3], selector);
- w3[0] = __byte_perm_S (w2[3], w2[2], selector);
- w2[3] = __byte_perm_S (w2[2], w2[1], selector);
- w2[2] = __byte_perm_S (w2[1], w2[0], selector);
- w2[1] = __byte_perm_S (w2[0], w1[3], selector);
- w2[0] = __byte_perm_S (w1[3], w1[2], selector);
- w1[3] = __byte_perm_S (w1[2], w1[1], selector);
- w1[2] = __byte_perm_S (w1[1], w1[0], selector);
- w1[1] = __byte_perm_S (w1[0], w0[3], selector);
- w1[0] = __byte_perm_S (w0[3], w0[2], selector);
- w0[3] = __byte_perm_S (w0[2], w0[1], selector);
- w0[2] = __byte_perm_S (w0[1], w0[0], selector);
- w0[1] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w3[0], w2[3], selector);
+ w3[0] = __byte_perm (w2[3], w2[2], selector);
+ w2[3] = __byte_perm (w2[2], w2[1], selector);
+ w2[2] = __byte_perm (w2[1], w2[0], selector);
+ w2[1] = __byte_perm (w2[0], w1[3], selector);
+ w2[0] = __byte_perm (w1[3], w1[2], selector);
+ w1[3] = __byte_perm (w1[2], w1[1], selector);
+ w1[2] = __byte_perm (w1[1], w1[0], selector);
+ w1[1] = __byte_perm (w1[0], w0[3], selector);
+ w1[0] = __byte_perm (w0[3], w0[2], selector);
+ w0[3] = __byte_perm (w0[2], w0[1], selector);
+ w0[2] = __byte_perm (w0[1], w0[0], selector);
+ w0[1] = __byte_perm (w0[0], 0, selector);
w0[0] = 0;
break;
case 2:
- w3[1] = __byte_perm_S (w2[3], w2[2], selector);
- w3[0] = __byte_perm_S (w2[2], w2[1], selector);
- w2[3] = __byte_perm_S (w2[1], w2[0], selector);
- w2[2] = __byte_perm_S (w2[0], w1[3], selector);
- w2[1] = __byte_perm_S (w1[3], w1[2], selector);
- w2[0] = __byte_perm_S (w1[2], w1[1], selector);
- w1[3] = __byte_perm_S (w1[1], w1[0], selector);
- w1[2] = __byte_perm_S (w1[0], w0[3], selector);
- w1[1] = __byte_perm_S (w0[3], w0[2], selector);
- w1[0] = __byte_perm_S (w0[2], w0[1], selector);
- w0[3] = __byte_perm_S (w0[1], w0[0], selector);
- w0[2] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w2[3], w2[2], selector);
+ w3[0] = __byte_perm (w2[2], w2[1], selector);
+ w2[3] = __byte_perm (w2[1], w2[0], selector);
+ w2[2] = __byte_perm (w2[0], w1[3], selector);
+ w2[1] = __byte_perm (w1[3], w1[2], selector);
+ w2[0] = __byte_perm (w1[2], w1[1], selector);
+ w1[3] = __byte_perm (w1[1], w1[0], selector);
+ w1[2] = __byte_perm (w1[0], w0[3], selector);
+ w1[1] = __byte_perm (w0[3], w0[2], selector);
+ w1[0] = __byte_perm (w0[2], w0[1], selector);
+ w0[3] = __byte_perm (w0[1], w0[0], selector);
+ w0[2] = __byte_perm (w0[0], 0, selector);
w0[1] = 0;
w0[0] = 0;
break;
case 3:
- w3[1] = __byte_perm_S (w2[2], w2[1], selector);
- w3[0] = __byte_perm_S (w2[1], w2[0], selector);
- w2[3] = __byte_perm_S (w2[0], w1[3], selector);
- w2[2] = __byte_perm_S (w1[3], w1[2], selector);
- w2[1] = __byte_perm_S (w1[2], w1[1], selector);
- w2[0] = __byte_perm_S (w1[1], w1[0], selector);
- w1[3] = __byte_perm_S (w1[0], w0[3], selector);
- w1[2] = __byte_perm_S (w0[3], w0[2], selector);
- w1[1] = __byte_perm_S (w0[2], w0[1], selector);
- w1[0] = __byte_perm_S (w0[1], w0[0], selector);
- w0[3] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w2[2], w2[1], selector);
+ w3[0] = __byte_perm (w2[1], w2[0], selector);
+ w2[3] = __byte_perm (w2[0], w1[3], selector);
+ w2[2] = __byte_perm (w1[3], w1[2], selector);
+ w2[1] = __byte_perm (w1[2], w1[1], selector);
+ w2[0] = __byte_perm (w1[1], w1[0], selector);
+ w1[3] = __byte_perm (w1[0], w0[3], selector);
+ w1[2] = __byte_perm (w0[3], w0[2], selector);
+ w1[1] = __byte_perm (w0[2], w0[1], selector);
+ w1[0] = __byte_perm (w0[1], w0[0], selector);
+ w0[3] = __byte_perm (w0[0], 0, selector);
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 4:
- w3[1] = __byte_perm_S (w2[1], w2[0], selector);
- w3[0] = __byte_perm_S (w2[0], w1[3], selector);
- w2[3] = __byte_perm_S (w1[3], w1[2], selector);
- w2[2] = __byte_perm_S (w1[2], w1[1], selector);
- w2[1] = __byte_perm_S (w1[1], w1[0], selector);
- w2[0] = __byte_perm_S (w1[0], w0[3], selector);
- w1[3] = __byte_perm_S (w0[3], w0[2], selector);
- w1[2] = __byte_perm_S (w0[2], w0[1], selector);
- w1[1] = __byte_perm_S (w0[1], w0[0], selector);
- w1[0] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w2[1], w2[0], selector);
+ w3[0] = __byte_perm (w2[0], w1[3], selector);
+ w2[3] = __byte_perm (w1[3], w1[2], selector);
+ w2[2] = __byte_perm (w1[2], w1[1], selector);
+ w2[1] = __byte_perm (w1[1], w1[0], selector);
+ w2[0] = __byte_perm (w1[0], w0[3], selector);
+ w1[3] = __byte_perm (w0[3], w0[2], selector);
+ w1[2] = __byte_perm (w0[2], w0[1], selector);
+ w1[1] = __byte_perm (w0[1], w0[0], selector);
+ w1[0] = __byte_perm (w0[0], 0, selector);
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
break;
case 5:
- w3[1] = __byte_perm_S (w2[0], w1[3], selector);
- w3[0] = __byte_perm_S (w1[3], w1[2], selector);
- w2[3] = __byte_perm_S (w1[2], w1[1], selector);
- w2[2] = __byte_perm_S (w1[1], w1[0], selector);
- w2[1] = __byte_perm_S (w1[0], w0[3], selector);
- w2[0] = __byte_perm_S (w0[3], w0[2], selector);
- w1[3] = __byte_perm_S (w0[2], w0[1], selector);
- w1[2] = __byte_perm_S (w0[1], w0[0], selector);
- w1[1] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w2[0], w1[3], selector);
+ w3[0] = __byte_perm (w1[3], w1[2], selector);
+ w2[3] = __byte_perm (w1[2], w1[1], selector);
+ w2[2] = __byte_perm (w1[1], w1[0], selector);
+ w2[1] = __byte_perm (w1[0], w0[3], selector);
+ w2[0] = __byte_perm (w0[3], w0[2], selector);
+ w1[3] = __byte_perm (w0[2], w0[1], selector);
+ w1[2] = __byte_perm (w0[1], w0[0], selector);
+ w1[1] = __byte_perm (w0[0], 0, selector);
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
break;
case 6:
- w3[1] = __byte_perm_S (w1[3], w1[2], selector);
- w3[0] = __byte_perm_S (w1[2], w1[1], selector);
- w2[3] = __byte_perm_S (w1[1], w1[0], selector);
- w2[2] = __byte_perm_S (w1[0], w0[3], selector);
- w2[1] = __byte_perm_S (w0[3], w0[2], selector);
- w2[0] = __byte_perm_S (w0[2], w0[1], selector);
- w1[3] = __byte_perm_S (w0[1], w0[0], selector);
- w1[2] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w1[3], w1[2], selector);
+ w3[0] = __byte_perm (w1[2], w1[1], selector);
+ w2[3] = __byte_perm (w1[1], w1[0], selector);
+ w2[2] = __byte_perm (w1[0], w0[3], selector);
+ w2[1] = __byte_perm (w0[3], w0[2], selector);
+ w2[0] = __byte_perm (w0[2], w0[1], selector);
+ w1[3] = __byte_perm (w0[1], w0[0], selector);
+ w1[2] = __byte_perm (w0[0], 0, selector);
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
break;
case 7:
- w3[1] = __byte_perm_S (w1[2], w1[1], selector);
- w3[0] = __byte_perm_S (w1[1], w1[0], selector);
- w2[3] = __byte_perm_S (w1[0], w0[3], selector);
- w2[2] = __byte_perm_S (w0[3], w0[2], selector);
- w2[1] = __byte_perm_S (w0[2], w0[1], selector);
- w2[0] = __byte_perm_S (w0[1], w0[0], selector);
- w1[3] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w1[2], w1[1], selector);
+ w3[0] = __byte_perm (w1[1], w1[0], selector);
+ w2[3] = __byte_perm (w1[0], w0[3], selector);
+ w2[2] = __byte_perm (w0[3], w0[2], selector);
+ w2[1] = __byte_perm (w0[2], w0[1], selector);
+ w2[0] = __byte_perm (w0[1], w0[0], selector);
+ w1[3] = __byte_perm (w0[0], 0, selector);
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
break;
case 8:
- w3[1] = __byte_perm_S (w1[1], w1[0], selector);
- w3[0] = __byte_perm_S (w1[0], w0[3], selector);
- w2[3] = __byte_perm_S (w0[3], w0[2], selector);
- w2[2] = __byte_perm_S (w0[2], w0[1], selector);
- w2[1] = __byte_perm_S (w0[1], w0[0], selector);
- w2[0] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w1[1], w1[0], selector);
+ w3[0] = __byte_perm (w1[0], w0[3], selector);
+ w2[3] = __byte_perm (w0[3], w0[2], selector);
+ w2[2] = __byte_perm (w0[2], w0[1], selector);
+ w2[1] = __byte_perm (w0[1], w0[0], selector);
+ w2[0] = __byte_perm (w0[0], 0, selector);
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
break;
case 9:
- w3[1] = __byte_perm_S (w1[0], w0[3], selector);
- w3[0] = __byte_perm_S (w0[3], w0[2], selector);
- w2[3] = __byte_perm_S (w0[2], w0[1], selector);
- w2[2] = __byte_perm_S (w0[1], w0[0], selector);
- w2[1] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w1[0], w0[3], selector);
+ w3[0] = __byte_perm (w0[3], w0[2], selector);
+ w2[3] = __byte_perm (w0[2], w0[1], selector);
+ w2[2] = __byte_perm (w0[1], w0[0], selector);
+ w2[1] = __byte_perm (w0[0], 0, selector);
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
break;
case 10:
- w3[1] = __byte_perm_S (w0[3], w0[2], selector);
- w3[0] = __byte_perm_S (w0[2], w0[1], selector);
- w2[3] = __byte_perm_S (w0[1], w0[0], selector);
- w2[2] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w0[3], w0[2], selector);
+ w3[0] = __byte_perm (w0[2], w0[1], selector);
+ w2[3] = __byte_perm (w0[1], w0[0], selector);
+ w2[2] = __byte_perm (w0[0], 0, selector);
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
break;
case 11:
- w3[1] = __byte_perm_S (w0[2], w0[1], selector);
- w3[0] = __byte_perm_S (w0[1], w0[0], selector);
- w2[3] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w0[2], w0[1], selector);
+ w3[0] = __byte_perm (w0[1], w0[0], selector);
+ w2[3] = __byte_perm (w0[0], 0, selector);
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
break;
case 12:
- w3[1] = __byte_perm_S (w0[1], w0[0], selector);
- w3[0] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w0[1], w0[0], selector);
+ w3[0] = __byte_perm (w0[0], 0, selector);
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
break;
case 13:
- w3[1] = __byte_perm_S (w0[0], 0, selector);
+ w3[1] = __byte_perm (w0[0], 0, selector);
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
#endif
}
-static void switch_buffer_by_offset (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
+static void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len)
{
- #if defined IS_AMD || defined IS_GENERIC
- const int offset_mod_4 = offset & 3;
+ #if defined cl_amd_media_ops
+ switch (salt_len)
+ {
+ case 0: sw[0] = w0;
+ break;
+ case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3);
+ sw[1] = amd_bytealign (sw[1] >> 8, w0, 3);
+ break;
+ case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2);
+ sw[1] = amd_bytealign (sw[1] >> 16, w0, 2);
+ break;
+ case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1);
+ sw[1] = amd_bytealign (sw[1] >> 24, w0, 1);
+ break;
+ case 4: sw[1] = w0;
+ break;
+ case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3);
+ sw[2] = amd_bytealign (sw[2] >> 8, w0, 3);
+ break;
+ case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2);
+ sw[2] = amd_bytealign (sw[2] >> 16, w0, 2);
+ break;
+ case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1);
+ sw[2] = amd_bytealign (sw[2] >> 24, w0, 1);
+ break;
+ case 8: sw[2] = w0;
+ break;
+ case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3);
+ sw[3] = amd_bytealign (sw[3] >> 8, w0, 3);
+ break;
+ case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2);
+ sw[3] = amd_bytealign (sw[3] >> 16, w0, 2);
+ break;
+ case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1);
+ sw[3] = amd_bytealign (sw[3] >> 24, w0, 1);
+ break;
+ case 12: sw[3] = w0;
+ break;
+ case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3);
+ sw[4] = amd_bytealign (sw[4] >> 8, w0, 3);
+ break;
+ case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2);
+ sw[4] = amd_bytealign (sw[4] >> 16, w0, 2);
+ break;
+ case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1);
+ sw[4] = amd_bytealign (sw[4] >> 24, w0, 1);
+ break;
+ case 16: sw[4] = w0;
+ break;
+ case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3);
+ sw[5] = amd_bytealign (sw[5] >> 8, w0, 3);
+ break;
+ case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2);
+ sw[5] = amd_bytealign (sw[5] >> 16, w0, 2);
+ break;
+ case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1);
+ sw[5] = amd_bytealign (sw[5] >> 24, w0, 1);
+ break;
+ case 20: sw[5] = w0;
+ break;
+ case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3);
+ sw[6] = amd_bytealign (sw[6] >> 8, w0, 3);
+ break;
+ case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2);
+ sw[6] = amd_bytealign (sw[6] >> 16, w0, 2);
+ break;
+ case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1);
+ sw[6] = amd_bytealign (sw[6] >> 24, w0, 1);
+ break;
+ case 24: sw[6] = w0;
+ break;
+ case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3);
+ sw[7] = amd_bytealign (sw[7] >> 8, w0, 3);
+ break;
+ case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2);
+ sw[7] = amd_bytealign (sw[7] >> 16, w0, 2);
+ break;
+ case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1);
+ sw[7] = amd_bytealign (sw[7] >> 24, w0, 1);
+ break;
+ case 28: sw[7] = w0;
+ break;
+ case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3);
+ sw[8] = amd_bytealign (sw[8] >> 8, w0, 3);
+ break;
+ case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2);
+ sw[8] = amd_bytealign (sw[8] >> 16, w0, 2);
+ break;
+ case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1);
+ sw[8] = amd_bytealign (sw[8] >> 24, w0, 1);
+ break;
+ }
+ #else
+ switch (salt_len)
+ {
+ case 0: sw[0] = w0;
+ break;
+ case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8);
+ sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
+ break;
+ case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16);
+ sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
+ break;
+ case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24);
+ sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
+ break;
+ case 4: sw[1] = w0;
+ break;
+ case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
+ sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
+ break;
+ case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
+ sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
+ break;
+ case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
+ sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
+ break;
+ case 8: sw[2] = w0;
+ break;
+ case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
+ sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
+ break;
+ case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
+ sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
+ break;
+ case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
+ sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
+ break;
+ case 12: sw[3] = w0;
+ break;
+ case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
+ sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
+ break;
+ case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
+ sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
+ break;
+ case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
+ sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
+ break;
+ case 16: sw[4] = w0;
+ break;
+ case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
+ sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
+ break;
+ case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
+ sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
+ break;
+ case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
+ sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
+ break;
+ case 20: sw[5] = w0;
+ break;
+ case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
+ sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
+ break;
+ case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
+ sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
+ break;
+ case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
+ sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
+ break;
+ case 24: sw[6] = w0;
+ break;
+ case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
+ sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
+ break;
+ case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
+ sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
+ break;
+ case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
+ sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
+ break;
+ case 28: sw[7] = w0;
+ break;
+ case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
+ sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24);
+ break;
+ case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
+ sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16);
+ break;
+ case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
+ sw[8] = (sw[8] & 0xff000000) | (w0 >> 8);
+ break;
+ }
+ #endif
+}
- const int offset_minus_4 = 4 - offset;
+static void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len)
+{
+ // would be nice to have optimization based on amd_bytealign as with _le counterpart
- switch (offset / 4)
+ switch (salt_len)
{
- case 0:
- w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4);
- w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4);
- w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4);
- w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4);
- w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4);
- w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
- w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
- w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
- w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
- w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w0[0] = amd_bytealign (w0[0], 0, offset_minus_4);
-
- if (offset_mod_4 == 0)
- {
- w0[0] = w0[1];
- w0[1] = w0[2];
- w0[2] = w0[3];
- w0[3] = w1[0];
- w1[0] = w1[1];
- w1[1] = w1[2];
- w1[2] = w1[3];
- w1[3] = w2[0];
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 1:
- w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4);
- w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4);
- w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4);
- w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4);
- w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4);
- w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
- w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
- w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
- w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w0[1] = amd_bytealign (w0[0], 0, offset_minus_4);
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w0[1] = w0[2];
- w0[2] = w0[3];
- w0[3] = w1[0];
- w1[0] = w1[1];
- w1[1] = w1[2];
- w1[2] = w1[3];
- w1[3] = w2[0];
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 2:
- w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4);
- w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4);
- w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4);
- w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4);
- w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4);
- w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
- w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
- w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w0[2] = amd_bytealign (w0[0], 0, offset_minus_4);
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w0[2] = w0[3];
- w0[3] = w1[0];
- w1[0] = w1[1];
- w1[1] = w1[2];
- w1[2] = w1[3];
- w1[3] = w2[0];
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 3:
- w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4);
- w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4);
- w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4);
- w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4);
- w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4);
- w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
- w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w0[3] = amd_bytealign (w0[0], 0, offset_minus_4);
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w0[3] = w1[0];
- w1[0] = w1[1];
- w1[1] = w1[2];
- w1[2] = w1[3];
- w1[3] = w2[0];
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 4:
- w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4);
- w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
- w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
- w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
- w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
- w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w1[0] = amd_bytealign (w0[0], 0, offset_minus_4);
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w1[0] = w1[1];
- w1[1] = w1[2];
- w1[2] = w1[3];
- w1[3] = w2[0];
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 5:
- w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4);
- w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
- w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
- w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
- w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w1[1] = amd_bytealign (w0[0], 0, offset_minus_4);
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w1[1] = w1[2];
- w1[2] = w1[3];
- w1[3] = w2[0];
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 6:
- w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4);
- w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
- w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
- w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w1[2] = amd_bytealign (w0[0], 0, offset_minus_4);
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w1[2] = w1[3];
- w1[3] = w2[0];
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 7:
- w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4);
- w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
- w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w1[3] = amd_bytealign (w0[0], 0, offset_minus_4);
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w1[3] = w2[0];
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 8:
- w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4);
- w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
- w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w2[0] = amd_bytealign (w0[0], 0, offset_minus_4);
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w2[0] = w2[1];
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 9:
- w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4);
- w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
- w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w2[1] = amd_bytealign (w0[0], 0, offset_minus_4);
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w2[1] = w2[2];
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 10:
- w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4);
- w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
- w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w2[2] = amd_bytealign (w0[0], 0, offset_minus_4);
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w2[2] = w2[3];
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 11:
- w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4);
- w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
- w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w2[3] = amd_bytealign (w0[0], 0, offset_minus_4);
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w2[3] = w3[0];
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 12:
- w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4);
- w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
- w3[0] = amd_bytealign (w0[0], 0, offset_minus_4);
- w2[3] = 0;
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w3[0] = w3[1];
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
-
- case 13:
- w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4);
- w3[1] = amd_bytealign (w0[0], 0, offset_minus_4);
- w3[0] = 0;
- w2[3] = 0;
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- if (offset_mod_4 == 0)
- {
- w3[1] = w3[2];
- w3[2] = 0;
- }
-
- break;
- }
- #endif
-
- #ifdef IS_NV
- const int offset_minus_4 = 4 - (offset % 4);
-
- const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
-
- switch (offset / 4)
- {
- case 0:
- w3[1] = __byte_perm (w3[0], w3[1], selector);
- w3[0] = __byte_perm (w2[3], w3[0], selector);
- w2[3] = __byte_perm (w2[2], w2[3], selector);
- w2[2] = __byte_perm (w2[1], w2[2], selector);
- w2[1] = __byte_perm (w2[0], w2[1], selector);
- w2[0] = __byte_perm (w1[3], w2[0], selector);
- w1[3] = __byte_perm (w1[2], w1[3], selector);
- w1[2] = __byte_perm (w1[1], w1[2], selector);
- w1[1] = __byte_perm (w1[0], w1[1], selector);
- w1[0] = __byte_perm (w0[3], w1[0], selector);
- w0[3] = __byte_perm (w0[2], w0[3], selector);
- w0[2] = __byte_perm (w0[1], w0[2], selector);
- w0[1] = __byte_perm (w0[0], w0[1], selector);
- w0[0] = __byte_perm ( 0, w0[0], selector);
-
- break;
-
- case 1:
- w3[1] = __byte_perm (w2[3], w3[0], selector);
- w3[0] = __byte_perm (w2[2], w2[3], selector);
- w2[3] = __byte_perm (w2[1], w2[2], selector);
- w2[2] = __byte_perm (w2[0], w2[1], selector);
- w2[1] = __byte_perm (w1[3], w2[0], selector);
- w2[0] = __byte_perm (w1[2], w1[3], selector);
- w1[3] = __byte_perm (w1[1], w1[2], selector);
- w1[2] = __byte_perm (w1[0], w1[1], selector);
- w1[1] = __byte_perm (w0[3], w1[0], selector);
- w1[0] = __byte_perm (w0[2], w0[3], selector);
- w0[3] = __byte_perm (w0[1], w0[2], selector);
- w0[2] = __byte_perm (w0[0], w0[1], selector);
- w0[1] = __byte_perm ( 0, w0[0], selector);
- w0[0] = 0;
-
- break;
-
- case 2:
- w3[1] = __byte_perm (w2[2], w2[3], selector);
- w3[0] = __byte_perm (w2[1], w2[2], selector);
- w2[3] = __byte_perm (w2[0], w2[1], selector);
- w2[2] = __byte_perm (w1[3], w2[0], selector);
- w2[1] = __byte_perm (w1[2], w1[3], selector);
- w2[0] = __byte_perm (w1[1], w1[2], selector);
- w1[3] = __byte_perm (w1[0], w1[1], selector);
- w1[2] = __byte_perm (w0[3], w1[0], selector);
- w1[1] = __byte_perm (w0[2], w0[3], selector);
- w1[0] = __byte_perm (w0[1], w0[2], selector);
- w0[3] = __byte_perm (w0[0], w0[1], selector);
- w0[2] = __byte_perm ( 0, w0[0], selector);
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 3:
- w3[1] = __byte_perm (w2[1], w2[2], selector);
- w3[0] = __byte_perm (w2[0], w2[1], selector);
- w2[3] = __byte_perm (w1[3], w2[0], selector);
- w2[2] = __byte_perm (w1[2], w1[3], selector);
- w2[1] = __byte_perm (w1[1], w1[2], selector);
- w2[0] = __byte_perm (w1[0], w1[1], selector);
- w1[3] = __byte_perm (w0[3], w1[0], selector);
- w1[2] = __byte_perm (w0[2], w0[3], selector);
- w1[1] = __byte_perm (w0[1], w0[2], selector);
- w1[0] = __byte_perm (w0[0], w0[1], selector);
- w0[3] = __byte_perm ( 0, w0[0], selector);
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 4:
- w3[1] = __byte_perm (w2[0], w2[1], selector);
- w3[0] = __byte_perm (w1[3], w2[0], selector);
- w2[3] = __byte_perm (w1[2], w1[3], selector);
- w2[2] = __byte_perm (w1[1], w1[2], selector);
- w2[1] = __byte_perm (w1[0], w1[1], selector);
- w2[0] = __byte_perm (w0[3], w1[0], selector);
- w1[3] = __byte_perm (w0[2], w0[3], selector);
- w1[2] = __byte_perm (w0[1], w0[2], selector);
- w1[1] = __byte_perm (w0[0], w0[1], selector);
- w1[0] = __byte_perm ( 0, w0[0], selector);
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 5:
- w3[1] = __byte_perm (w1[3], w2[0], selector);
- w3[0] = __byte_perm (w1[2], w1[3], selector);
- w2[3] = __byte_perm (w1[1], w1[2], selector);
- w2[2] = __byte_perm (w1[0], w1[1], selector);
- w2[1] = __byte_perm (w0[3], w1[0], selector);
- w2[0] = __byte_perm (w0[2], w0[3], selector);
- w1[3] = __byte_perm (w0[1], w0[2], selector);
- w1[2] = __byte_perm (w0[0], w0[1], selector);
- w1[1] = __byte_perm ( 0, w0[0], selector);
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 6:
- w3[1] = __byte_perm (w1[2], w1[3], selector);
- w3[0] = __byte_perm (w1[1], w1[2], selector);
- w2[3] = __byte_perm (w1[0], w1[1], selector);
- w2[2] = __byte_perm (w0[3], w1[0], selector);
- w2[1] = __byte_perm (w0[2], w0[3], selector);
- w2[0] = __byte_perm (w0[1], w0[2], selector);
- w1[3] = __byte_perm (w0[0], w0[1], selector);
- w1[2] = __byte_perm ( 0, w0[0], selector);
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 7:
- w3[1] = __byte_perm (w1[1], w1[2], selector);
- w3[0] = __byte_perm (w1[0], w1[1], selector);
- w2[3] = __byte_perm (w0[3], w1[0], selector);
- w2[2] = __byte_perm (w0[2], w0[3], selector);
- w2[1] = __byte_perm (w0[1], w0[2], selector);
- w2[0] = __byte_perm (w0[0], w0[1], selector);
- w1[3] = __byte_perm ( 0, w0[0], selector);
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 8:
- w3[1] = __byte_perm (w1[0], w1[1], selector);
- w3[0] = __byte_perm (w0[3], w1[0], selector);
- w2[3] = __byte_perm (w0[2], w0[3], selector);
- w2[2] = __byte_perm (w0[1], w0[2], selector);
- w2[1] = __byte_perm (w0[0], w0[1], selector);
- w2[0] = __byte_perm ( 0, w0[0], selector);
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 9:
- w3[1] = __byte_perm (w0[3], w1[0], selector);
- w3[0] = __byte_perm (w0[2], w0[3], selector);
- w2[3] = __byte_perm (w0[1], w0[2], selector);
- w2[2] = __byte_perm (w0[0], w0[1], selector);
- w2[1] = __byte_perm ( 0, w0[0], selector);
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 10:
- w3[1] = __byte_perm (w0[2], w0[3], selector);
- w3[0] = __byte_perm (w0[1], w0[2], selector);
- w2[3] = __byte_perm (w0[0], w0[1], selector);
- w2[2] = __byte_perm ( 0, w0[0], selector);
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 11:
- w3[1] = __byte_perm (w0[1], w0[2], selector);
- w3[0] = __byte_perm (w0[0], w0[1], selector);
- w2[3] = __byte_perm ( 0, w0[0], selector);
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 12:
- w3[1] = __byte_perm (w0[0], w0[1], selector);
- w3[0] = __byte_perm ( 0, w0[0], selector);
- w2[3] = 0;
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
-
- case 13:
- w3[1] = __byte_perm ( 0, w0[0], selector);
- w3[0] = 0;
- w2[3] = 0;
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
-
- break;
- }
- #endif
-}
-
-static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
-{
- #if defined IS_AMD || defined IS_GENERIC
- switch (offset / 4)
- {
- case 0:
- w3[2] = amd_bytealign (w3[1], 0, offset);
- w3[1] = amd_bytealign (w3[0], w3[1], offset);
- w3[0] = amd_bytealign (w2[3], w3[0], offset);
- w2[3] = amd_bytealign (w2[2], w2[3], offset);
- w2[2] = amd_bytealign (w2[1], w2[2], offset);
- w2[1] = amd_bytealign (w2[0], w2[1], offset);
- w2[0] = amd_bytealign (w1[3], w2[0], offset);
- w1[3] = amd_bytealign (w1[2], w1[3], offset);
- w1[2] = amd_bytealign (w1[1], w1[2], offset);
- w1[1] = amd_bytealign (w1[0], w1[1], offset);
- w1[0] = amd_bytealign (w0[3], w1[0], offset);
- w0[3] = amd_bytealign (w0[2], w0[3], offset);
- w0[2] = amd_bytealign (w0[1], w0[2], offset);
- w0[1] = amd_bytealign (w0[0], w0[1], offset);
- w0[0] = amd_bytealign ( 0, w0[0], offset);
- break;
-
- case 1:
- w3[2] = amd_bytealign (w3[0], 0, offset);
- w3[1] = amd_bytealign (w2[3], w3[0], offset);
- w3[0] = amd_bytealign (w2[2], w2[3], offset);
- w2[3] = amd_bytealign (w2[1], w2[2], offset);
- w2[2] = amd_bytealign (w2[0], w2[1], offset);
- w2[1] = amd_bytealign (w1[3], w2[0], offset);
- w2[0] = amd_bytealign (w1[2], w1[3], offset);
- w1[3] = amd_bytealign (w1[1], w1[2], offset);
- w1[2] = amd_bytealign (w1[0], w1[1], offset);
- w1[1] = amd_bytealign (w0[3], w1[0], offset);
- w1[0] = amd_bytealign (w0[2], w0[3], offset);
- w0[3] = amd_bytealign (w0[1], w0[2], offset);
- w0[2] = amd_bytealign (w0[0], w0[1], offset);
- w0[1] = amd_bytealign ( 0, w0[0], offset);
- w0[0] = 0;
- break;
-
- case 2:
- w3[2] = amd_bytealign (w2[3], 0, offset);
- w3[1] = amd_bytealign (w2[2], w2[3], offset);
- w3[0] = amd_bytealign (w2[1], w2[2], offset);
- w2[3] = amd_bytealign (w2[0], w2[1], offset);
- w2[2] = amd_bytealign (w1[3], w2[0], offset);
- w2[1] = amd_bytealign (w1[2], w1[3], offset);
- w2[0] = amd_bytealign (w1[1], w1[2], offset);
- w1[3] = amd_bytealign (w1[0], w1[1], offset);
- w1[2] = amd_bytealign (w0[3], w1[0], offset);
- w1[1] = amd_bytealign (w0[2], w0[3], offset);
- w1[0] = amd_bytealign (w0[1], w0[2], offset);
- w0[3] = amd_bytealign (w0[0], w0[1], offset);
- w0[2] = amd_bytealign ( 0, w0[0], offset);
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 3:
- w3[2] = amd_bytealign (w2[2], 0, offset);
- w3[1] = amd_bytealign (w2[1], w2[2], offset);
- w3[0] = amd_bytealign (w2[0], w2[1], offset);
- w2[3] = amd_bytealign (w1[3], w2[0], offset);
- w2[2] = amd_bytealign (w1[2], w1[3], offset);
- w2[1] = amd_bytealign (w1[1], w1[2], offset);
- w2[0] = amd_bytealign (w1[0], w1[1], offset);
- w1[3] = amd_bytealign (w0[3], w1[0], offset);
- w1[2] = amd_bytealign (w0[2], w0[3], offset);
- w1[1] = amd_bytealign (w0[1], w0[2], offset);
- w1[0] = amd_bytealign (w0[0], w0[1], offset);
- w0[3] = amd_bytealign ( 0, w0[0], offset);
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 4:
- w3[2] = amd_bytealign (w2[1], 0, offset);
- w3[1] = amd_bytealign (w2[0], w2[1], offset);
- w3[0] = amd_bytealign (w1[3], w2[0], offset);
- w2[3] = amd_bytealign (w1[2], w1[3], offset);
- w2[2] = amd_bytealign (w1[1], w1[2], offset);
- w2[1] = amd_bytealign (w1[0], w1[1], offset);
- w2[0] = amd_bytealign (w0[3], w1[0], offset);
- w1[3] = amd_bytealign (w0[2], w0[3], offset);
- w1[2] = amd_bytealign (w0[1], w0[2], offset);
- w1[1] = amd_bytealign (w0[0], w0[1], offset);
- w1[0] = amd_bytealign ( 0, w0[0], offset);
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 5:
- w3[2] = amd_bytealign (w2[0], 0, offset);
- w3[1] = amd_bytealign (w1[3], w2[0], offset);
- w3[0] = amd_bytealign (w1[2], w1[3], offset);
- w2[3] = amd_bytealign (w1[1], w1[2], offset);
- w2[2] = amd_bytealign (w1[0], w1[1], offset);
- w2[1] = amd_bytealign (w0[3], w1[0], offset);
- w2[0] = amd_bytealign (w0[2], w0[3], offset);
- w1[3] = amd_bytealign (w0[1], w0[2], offset);
- w1[2] = amd_bytealign (w0[0], w0[1], offset);
- w1[1] = amd_bytealign ( 0, w0[0], offset);
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 6:
- w3[2] = amd_bytealign (w1[3], 0, offset);
- w3[1] = amd_bytealign (w1[2], w1[3], offset);
- w3[0] = amd_bytealign (w1[1], w1[2], offset);
- w2[3] = amd_bytealign (w1[0], w1[1], offset);
- w2[2] = amd_bytealign (w0[3], w1[0], offset);
- w2[1] = amd_bytealign (w0[2], w0[3], offset);
- w2[0] = amd_bytealign (w0[1], w0[2], offset);
- w1[3] = amd_bytealign (w0[0], w0[1], offset);
- w1[2] = amd_bytealign ( 0, w0[0], offset);
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 7:
- w3[2] = amd_bytealign (w1[2], 0, offset);
- w3[1] = amd_bytealign (w1[1], w1[2], offset);
- w3[0] = amd_bytealign (w1[0], w1[1], offset);
- w2[3] = amd_bytealign (w0[3], w1[0], offset);
- w2[2] = amd_bytealign (w0[2], w0[3], offset);
- w2[1] = amd_bytealign (w0[1], w0[2], offset);
- w2[0] = amd_bytealign (w0[0], w0[1], offset);
- w1[3] = amd_bytealign ( 0, w0[0], offset);
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 8:
- w3[2] = amd_bytealign (w1[1], 0, offset);
- w3[1] = amd_bytealign (w1[0], w1[1], offset);
- w3[0] = amd_bytealign (w0[3], w1[0], offset);
- w2[3] = amd_bytealign (w0[2], w0[3], offset);
- w2[2] = amd_bytealign (w0[1], w0[2], offset);
- w2[1] = amd_bytealign (w0[0], w0[1], offset);
- w2[0] = amd_bytealign ( 0, w0[0], offset);
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 9:
- w3[2] = amd_bytealign (w1[0], 0, offset);
- w3[1] = amd_bytealign (w0[3], w1[0], offset);
- w3[0] = amd_bytealign (w0[2], w0[3], offset);
- w2[3] = amd_bytealign (w0[1], w0[2], offset);
- w2[2] = amd_bytealign (w0[0], w0[1], offset);
- w2[1] = amd_bytealign ( 0, w0[0], offset);
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 10:
- w3[2] = amd_bytealign (w0[3], 0, offset);
- w3[1] = amd_bytealign (w0[2], w0[3], offset);
- w3[0] = amd_bytealign (w0[1], w0[2], offset);
- w2[3] = amd_bytealign (w0[0], w0[1], offset);
- w2[2] = amd_bytealign ( 0, w0[0], offset);
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 11:
- w3[2] = amd_bytealign (w0[2], 0, offset);
- w3[1] = amd_bytealign (w0[1], w0[2], offset);
- w3[0] = amd_bytealign (w0[0], w0[1], offset);
- w2[3] = amd_bytealign ( 0, w0[0], offset);
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 12:
- w3[2] = amd_bytealign (w0[1], 0, offset);
- w3[1] = amd_bytealign (w0[0], w0[1], offset);
- w3[0] = amd_bytealign ( 0, w0[0], offset);
- w2[3] = 0;
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 13:
- w3[2] = amd_bytealign (w0[0], 0, offset);
- w3[1] = amd_bytealign ( 0, w0[0], offset);
- w3[0] = 0;
- w2[3] = 0;
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
- }
- #endif
-
- #ifdef IS_NV
- const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
-
- switch (offset / 4)
- {
- case 0:
- w3[1] = __byte_perm (w3[1], w3[0], selector);
- w3[0] = __byte_perm (w3[0], w2[3], selector);
- w2[3] = __byte_perm (w2[3], w2[2], selector);
- w2[2] = __byte_perm (w2[2], w2[1], selector);
- w2[1] = __byte_perm (w2[1], w2[0], selector);
- w2[0] = __byte_perm (w2[0], w1[3], selector);
- w1[3] = __byte_perm (w1[3], w1[2], selector);
- w1[2] = __byte_perm (w1[2], w1[1], selector);
- w1[1] = __byte_perm (w1[1], w1[0], selector);
- w1[0] = __byte_perm (w1[0], w0[3], selector);
- w0[3] = __byte_perm (w0[3], w0[2], selector);
- w0[2] = __byte_perm (w0[2], w0[1], selector);
- w0[1] = __byte_perm (w0[1], w0[0], selector);
- w0[0] = __byte_perm (w0[0], 0, selector);
- break;
-
- case 1:
- w3[1] = __byte_perm (w3[0], w2[3], selector);
- w3[0] = __byte_perm (w2[3], w2[2], selector);
- w2[3] = __byte_perm (w2[2], w2[1], selector);
- w2[2] = __byte_perm (w2[1], w2[0], selector);
- w2[1] = __byte_perm (w2[0], w1[3], selector);
- w2[0] = __byte_perm (w1[3], w1[2], selector);
- w1[3] = __byte_perm (w1[2], w1[1], selector);
- w1[2] = __byte_perm (w1[1], w1[0], selector);
- w1[1] = __byte_perm (w1[0], w0[3], selector);
- w1[0] = __byte_perm (w0[3], w0[2], selector);
- w0[3] = __byte_perm (w0[2], w0[1], selector);
- w0[2] = __byte_perm (w0[1], w0[0], selector);
- w0[1] = __byte_perm (w0[0], 0, selector);
- w0[0] = 0;
- break;
-
- case 2:
- w3[1] = __byte_perm (w2[3], w2[2], selector);
- w3[0] = __byte_perm (w2[2], w2[1], selector);
- w2[3] = __byte_perm (w2[1], w2[0], selector);
- w2[2] = __byte_perm (w2[0], w1[3], selector);
- w2[1] = __byte_perm (w1[3], w1[2], selector);
- w2[0] = __byte_perm (w1[2], w1[1], selector);
- w1[3] = __byte_perm (w1[1], w1[0], selector);
- w1[2] = __byte_perm (w1[0], w0[3], selector);
- w1[1] = __byte_perm (w0[3], w0[2], selector);
- w1[0] = __byte_perm (w0[2], w0[1], selector);
- w0[3] = __byte_perm (w0[1], w0[0], selector);
- w0[2] = __byte_perm (w0[0], 0, selector);
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 3:
- w3[1] = __byte_perm (w2[2], w2[1], selector);
- w3[0] = __byte_perm (w2[1], w2[0], selector);
- w2[3] = __byte_perm (w2[0], w1[3], selector);
- w2[2] = __byte_perm (w1[3], w1[2], selector);
- w2[1] = __byte_perm (w1[2], w1[1], selector);
- w2[0] = __byte_perm (w1[1], w1[0], selector);
- w1[3] = __byte_perm (w1[0], w0[3], selector);
- w1[2] = __byte_perm (w0[3], w0[2], selector);
- w1[1] = __byte_perm (w0[2], w0[1], selector);
- w1[0] = __byte_perm (w0[1], w0[0], selector);
- w0[3] = __byte_perm (w0[0], 0, selector);
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 4:
- w3[1] = __byte_perm (w2[1], w2[0], selector);
- w3[0] = __byte_perm (w2[0], w1[3], selector);
- w2[3] = __byte_perm (w1[3], w1[2], selector);
- w2[2] = __byte_perm (w1[2], w1[1], selector);
- w2[1] = __byte_perm (w1[1], w1[0], selector);
- w2[0] = __byte_perm (w1[0], w0[3], selector);
- w1[3] = __byte_perm (w0[3], w0[2], selector);
- w1[2] = __byte_perm (w0[2], w0[1], selector);
- w1[1] = __byte_perm (w0[1], w0[0], selector);
- w1[0] = __byte_perm (w0[0], 0, selector);
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 5:
- w3[1] = __byte_perm (w2[0], w1[3], selector);
- w3[0] = __byte_perm (w1[3], w1[2], selector);
- w2[3] = __byte_perm (w1[2], w1[1], selector);
- w2[2] = __byte_perm (w1[1], w1[0], selector);
- w2[1] = __byte_perm (w1[0], w0[3], selector);
- w2[0] = __byte_perm (w0[3], w0[2], selector);
- w1[3] = __byte_perm (w0[2], w0[1], selector);
- w1[2] = __byte_perm (w0[1], w0[0], selector);
- w1[1] = __byte_perm (w0[0], 0, selector);
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 6:
- w3[1] = __byte_perm (w1[3], w1[2], selector);
- w3[0] = __byte_perm (w1[2], w1[1], selector);
- w2[3] = __byte_perm (w1[1], w1[0], selector);
- w2[2] = __byte_perm (w1[0], w0[3], selector);
- w2[1] = __byte_perm (w0[3], w0[2], selector);
- w2[0] = __byte_perm (w0[2], w0[1], selector);
- w1[3] = __byte_perm (w0[1], w0[0], selector);
- w1[2] = __byte_perm (w0[0], 0, selector);
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 7:
- w3[1] = __byte_perm (w1[2], w1[1], selector);
- w3[0] = __byte_perm (w1[1], w1[0], selector);
- w2[3] = __byte_perm (w1[0], w0[3], selector);
- w2[2] = __byte_perm (w0[3], w0[2], selector);
- w2[1] = __byte_perm (w0[2], w0[1], selector);
- w2[0] = __byte_perm (w0[1], w0[0], selector);
- w1[3] = __byte_perm (w0[0], 0, selector);
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 8:
- w3[1] = __byte_perm (w1[1], w1[0], selector);
- w3[0] = __byte_perm (w1[0], w0[3], selector);
- w2[3] = __byte_perm (w0[3], w0[2], selector);
- w2[2] = __byte_perm (w0[2], w0[1], selector);
- w2[1] = __byte_perm (w0[1], w0[0], selector);
- w2[0] = __byte_perm (w0[0], 0, selector);
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 9:
- w3[1] = __byte_perm (w1[0], w0[3], selector);
- w3[0] = __byte_perm (w0[3], w0[2], selector);
- w2[3] = __byte_perm (w0[2], w0[1], selector);
- w2[2] = __byte_perm (w0[1], w0[0], selector);
- w2[1] = __byte_perm (w0[0], 0, selector);
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 10:
- w3[1] = __byte_perm (w0[3], w0[2], selector);
- w3[0] = __byte_perm (w0[2], w0[1], selector);
- w2[3] = __byte_perm (w0[1], w0[0], selector);
- w2[2] = __byte_perm (w0[0], 0, selector);
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 11:
- w3[1] = __byte_perm (w0[2], w0[1], selector);
- w3[0] = __byte_perm (w0[1], w0[0], selector);
- w2[3] = __byte_perm (w0[0], 0, selector);
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 12:
- w3[1] = __byte_perm (w0[1], w0[0], selector);
- w3[0] = __byte_perm (w0[0], 0, selector);
- w2[3] = 0;
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
-
- case 13:
- w3[1] = __byte_perm (w0[0], 0, selector);
- w3[0] = 0;
- w2[3] = 0;
- w2[2] = 0;
- w2[1] = 0;
- w2[0] = 0;
- w1[3] = 0;
- w1[2] = 0;
- w1[1] = 0;
- w1[0] = 0;
- w0[3] = 0;
- w0[2] = 0;
- w0[1] = 0;
- w0[0] = 0;
- break;
- }
- #endif
-}
-
-/* not needed anymore?
-
-// before: append_0x80_2_be
-static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset)
-{
- switch (offset)
- {
- case 0:
- w0[0] |= 0x80000000;
- break;
-
- case 1:
- w0[0] |= 0x800000;
- break;
-
- case 2:
- w0[0] |= 0x8000;
- break;
-
- case 3:
- w0[0] |= 0x80;
- break;
-
- case 4:
- w0[1] |= 0x80000000;
- break;
-
- case 5:
- w0[1] |= 0x800000;
- break;
-
- case 6:
- w0[1] |= 0x8000;
- break;
-
- case 7:
- w0[1] |= 0x80;
- break;
-
- case 8:
- w0[2] |= 0x80000000;
- break;
-
- case 9:
- w0[2] |= 0x800000;
- break;
-
- case 10:
- w0[2] |= 0x8000;
- break;
-
- case 11:
- w0[2] |= 0x80;
- break;
-
- case 12:
- w0[3] |= 0x80000000;
- break;
-
- case 13:
- w0[3] |= 0x800000;
- break;
-
- case 14:
- w0[3] |= 0x8000;
- break;
-
- case 15:
- w0[3] |= 0x80;
- break;
-
- case 16:
- w1[0] |= 0x80000000;
- break;
-
- case 17:
- w1[0] |= 0x800000;
- break;
-
- case 18:
- w1[0] |= 0x8000;
- break;
-
- case 19:
- w1[0] |= 0x80;
- break;
-
- case 20:
- w1[1] |= 0x80000000;
- break;
-
- case 21:
- w1[1] |= 0x800000;
- break;
-
- case 22:
- w1[1] |= 0x8000;
- break;
-
- case 23:
- w1[1] |= 0x80;
- break;
-
- case 24:
- w1[2] |= 0x80000000;
- break;
-
- case 25:
- w1[2] |= 0x800000;
- break;
-
- case 26:
- w1[2] |= 0x8000;
- break;
-
- case 27:
- w1[2] |= 0x80;
- break;
-
- case 28:
- w1[3] |= 0x80000000;
- break;
-
- case 29:
- w1[3] |= 0x800000;
- break;
-
- case 30:
- w1[3] |= 0x8000;
- break;
-
- case 31:
- w1[3] |= 0x80;
- break;
- }
-}
-
-// before: append_0x80_8
-static void append_0x80_1x32 (u32 w[32], const u32 offset)
-{
- switch (offset)
- {
- case 0:
- w[ 0] = 0x80;
- break;
-
- case 1:
- w[ 0] = w[ 0] | 0x8000;
- break;
-
- case 2:
- w[ 0] = w[ 0] | 0x800000;
- break;
-
- case 3:
- w[ 0] = w[ 0] | 0x80000000;
- break;
-
- case 4:
- w[ 1] = 0x80;
- break;
-
- case 5:
- w[ 1] = w[ 1] | 0x8000;
- break;
-
- case 6:
- w[ 1] = w[ 1] | 0x800000;
- break;
-
- case 7:
- w[ 1] = w[ 1] | 0x80000000;
- break;
-
- case 8:
- w[ 2] = 0x80;
- break;
-
- case 9:
- w[ 2] = w[ 2] | 0x8000;
- break;
-
- case 10:
- w[ 2] = w[ 2] | 0x800000;
- break;
-
- case 11:
- w[ 2] = w[ 2] | 0x80000000;
- break;
-
- case 12:
- w[ 3] = 0x80;
- break;
-
- case 13:
- w[ 3] = w[ 3] | 0x8000;
- break;
-
- case 14:
- w[ 3] = w[ 3] | 0x800000;
- break;
-
- case 15:
- w[ 3] = w[ 3] | 0x80000000;
- break;
-
- case 16:
- w[ 4] = 0x80;
- break;
-
- case 17:
- w[ 4] = w[ 4] | 0x8000;
- break;
-
- case 18:
- w[ 4] = w[ 4] | 0x800000;
- break;
-
- case 19:
- w[ 4] = w[ 4] | 0x80000000;
- break;
-
- case 20:
- w[ 5] = 0x80;
- break;
-
- case 21:
- w[ 5] = w[ 5] | 0x8000;
- break;
-
- case 22:
- w[ 5] = w[ 5] | 0x800000;
- break;
-
- case 23:
- w[ 5] = w[ 5] | 0x80000000;
- break;
-
- case 24:
- w[ 6] = 0x80;
- break;
-
- case 25:
- w[ 6] = w[ 6] | 0x8000;
- break;
-
- case 26:
- w[ 6] = w[ 6] | 0x800000;
- break;
-
- case 27:
- w[ 6] = w[ 6] | 0x80000000;
- break;
-
- case 28:
- w[ 7] = 0x80;
- break;
-
- case 29:
- w[ 7] = w[ 7] | 0x8000;
- break;
-
- case 30:
- w[ 7] = w[ 7] | 0x800000;
- break;
-
- case 31:
- w[ 7] = w[ 7] | 0x80000000;
- break;
-
- case 32:
- w[ 8] = 0x80;
- break;
-
- case 33:
- w[ 8] = w[ 8] | 0x8000;
- break;
-
- case 34:
- w[ 8] = w[ 8] | 0x800000;
- break;
-
- case 35:
- w[ 8] = w[ 8] | 0x80000000;
- break;
-
- case 36:
- w[ 9] = 0x80;
- break;
-
- case 37:
- w[ 9] = w[ 9] | 0x8000;
- break;
-
- case 38:
- w[ 9] = w[ 9] | 0x800000;
- break;
-
- case 39:
- w[ 9] = w[ 9] | 0x80000000;
- break;
-
- case 40:
- w[10] = 0x80;
- break;
-
- case 41:
- w[10] = w[10] | 0x8000;
- break;
-
- case 42:
- w[10] = w[10] | 0x800000;
- break;
-
- case 43:
- w[10] = w[10] | 0x80000000;
- break;
-
- case 44:
- w[11] = 0x80;
- break;
-
- case 45:
- w[11] = w[11] | 0x8000;
- break;
-
- case 46:
- w[11] = w[11] | 0x800000;
- break;
-
- case 47:
- w[11] = w[11] | 0x80000000;
- break;
-
- case 48:
- w[12] = 0x80;
- break;
-
- case 49:
- w[12] = w[12] | 0x8000;
- break;
-
- case 50:
- w[12] = w[12] | 0x800000;
- break;
-
- case 51:
- w[12] = w[12] | 0x80000000;
- break;
-
- case 52:
- w[13] = 0x80;
- break;
-
- case 53:
- w[13] = w[13] | 0x8000;
- break;
-
- case 54:
- w[13] = w[13] | 0x800000;
- break;
-
- case 55:
- w[13] = w[13] | 0x80000000;
- break;
-
- case 56:
- w[14] = 0x80;
- break;
-
- case 57:
- w[14] = w[14] | 0x8000;
- break;
-
- case 58:
- w[14] = w[14] | 0x800000;
- break;
-
- case 59:
- w[14] = w[14] | 0x80000000;
- break;
-
- case 60:
- w[15] = 0x80;
- break;
-
- case 61:
- w[15] = w[15] | 0x8000;
- break;
-
- case 62:
- w[15] = w[15] | 0x800000;
- break;
-
- case 63:
- w[15] = w[15] | 0x80000000;
- break;
-
- case 64:
- w[16] = 0x80;
- break;
-
- case 65:
- w[16] = w[16] | 0x8000;
- break;
-
- case 66:
- w[16] = w[16] | 0x800000;
- break;
-
- case 67:
- w[16] = w[16] | 0x80000000;
- break;
-
- case 68:
- w[17] = 0x80;
- break;
-
- case 69:
- w[17] = w[17] | 0x8000;
- break;
-
- case 70:
- w[17] = w[17] | 0x800000;
- break;
-
- case 71:
- w[17] = w[17] | 0x80000000;
- break;
-
- case 72:
- w[18] = 0x80;
- break;
-
- case 73:
- w[18] = w[18] | 0x8000;
- break;
-
- case 74:
- w[18] = w[18] | 0x800000;
- break;
-
- case 75:
- w[18] = w[18] | 0x80000000;
- break;
-
- case 76:
- w[19] = 0x80;
- break;
-
- case 77:
- w[19] = w[19] | 0x8000;
- break;
-
- case 78:
- w[19] = w[19] | 0x800000;
- break;
-
- case 79:
- w[19] = w[19] | 0x80000000;
- break;
-
- case 80:
- w[20] = 0x80;
- break;
-
- case 81:
- w[20] = w[20] | 0x8000;
- break;
-
- case 82:
- w[20] = w[20] | 0x800000;
- break;
-
- case 83:
- w[20] = w[20] | 0x80000000;
- break;
-
- case 84:
- w[21] = 0x80;
- break;
-
- case 85:
- w[21] = w[21] | 0x8000;
- break;
+ case 0: sw[0] = w0;
+ break;
+ case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8);
+ sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
+ break;
+ case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16);
+ sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
+ break;
+ case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24);
+ sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
+ break;
+ case 4: sw[1] = w0;
+ break;
+ case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
+ sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
+ break;
+ case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
+ sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
+ break;
+ case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
+ sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
+ break;
+ case 8: sw[2] = w0;
+ break;
+ case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
+ sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
+ break;
+ case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
+ sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
+ break;
+ case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
+ sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
+ break;
+ case 12: sw[3] = w0;
+ break;
+ case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
+ sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
+ break;
+ case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
+ sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
+ break;
+ case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
+ sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
+ break;
+ case 16: sw[4] = w0;
+ break;
+ case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
+ sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
+ break;
+ case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
+ sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
+ break;
+ case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
+ sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
+ break;
+ case 20: sw[5] = w0;
+ break;
+ case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
+ sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
+ break;
+ case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
+ sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
+ break;
+ case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
+ sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
+ break;
+ case 24: sw[6] = w0;
+ break;
+ case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
+ sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
+ break;
+ case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
+ sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
+ break;
+ case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
+ sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
+ break;
+ case 28: sw[7] = w0;
+ break;
+ case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
+ sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24);
+ break;
+ case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
+ sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16);
+ break;
+ case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
+ sw[8] = (sw[8] & 0x000000ff) | (w0 << 8);
+ break;
+ }
+}
- case 86:
- w[21] = w[21] | 0x800000;
- break;
+/**
+ * vector functions as scalar (for outer loop usage)
+ */
- case 87:
- w[21] = w[21] | 0x80000000;
+static void append_0x80_1x4_S (u32 w0[4], const u32 offset)
+{
+ switch (offset)
+ {
+ case 0:
+ w0[0] = 0x80;
break;
- case 88:
- w[22] = 0x80;
+ case 1:
+ w0[0] = w0[0] | 0x8000;
break;
- case 89:
- w[22] = w[22] | 0x8000;
+ case 2:
+ w0[0] = w0[0] | 0x800000;
break;
- case 90:
- w[22] = w[22] | 0x800000;
+ case 3:
+ w0[0] = w0[0] | 0x80000000;
break;
- case 91:
- w[22] = w[22] | 0x80000000;
+ case 4:
+ w0[1] = 0x80;
break;
- case 92:
- w[23] = 0x80;
+ case 5:
+ w0[1] = w0[1] | 0x8000;
break;
- case 93:
- w[23] = w[23] | 0x8000;
+ case 6:
+ w0[1] = w0[1] | 0x800000;
break;
- case 94:
- w[23] = w[23] | 0x800000;
+ case 7:
+ w0[1] = w0[1] | 0x80000000;
break;
- case 95:
- w[23] = w[23] | 0x80000000;
+ case 8:
+ w0[2] = 0x80;
break;
- case 96:
- w[24] = 0x80;
+ case 9:
+ w0[2] = w0[2] | 0x8000;
break;
- case 97:
- w[24] = w[24] | 0x8000;
+ case 10:
+ w0[2] = w0[2] | 0x800000;
break;
- case 98:
- w[24] = w[24] | 0x800000;
+ case 11:
+ w0[2] = w0[2] | 0x80000000;
break;
- case 99:
- w[24] = w[24] | 0x80000000;
+ case 12:
+ w0[3] = 0x80;
break;
- case 100:
- w[25] = 0x80;
+ case 13:
+ w0[3] = w0[3] | 0x8000;
break;
- case 101:
- w[25] = w[25] | 0x8000;
+ case 14:
+ w0[3] = w0[3] | 0x800000;
break;
- case 102:
- w[25] = w[25] | 0x800000;
+ case 15:
+ w0[3] = w0[3] | 0x80000000;
break;
+ }
+}
- case 103:
- w[25] = w[25] | 0x80000000;
+static void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
+{
+ switch (offset)
+ {
+ case 0:
+ w0[0] = 0x80;
break;
- case 104:
- w[26] = 0x80;
+ case 1:
+ w0[0] = w0[0] | 0x8000;
break;
- case 105:
- w[26] = w[26] | 0x8000;
+ case 2:
+ w0[0] = w0[0] | 0x800000;
break;
- case 106:
- w[26] = w[26] | 0x800000;
+ case 3:
+ w0[0] = w0[0] | 0x80000000;
break;
- case 107:
- w[26] = w[26] | 0x80000000;
+ case 4:
+ w0[1] = 0x80;
break;
- case 108:
- w[27] = 0x80;
+ case 5:
+ w0[1] = w0[1] | 0x8000;
break;
- case 109:
- w[27] = w[27] | 0x8000;
+ case 6:
+ w0[1] = w0[1] | 0x800000;
break;
- case 110:
- w[27] = w[27] | 0x800000;
+ case 7:
+ w0[1] = w0[1] | 0x80000000;
break;
- case 111:
- w[27] = w[27] | 0x80000000;
+ case 8:
+ w0[2] = 0x80;
break;
- case 112:
- w[28] = 0x80;
+ case 9:
+ w0[2] = w0[2] | 0x8000;
break;
- case 113:
- w[28] = w[28] | 0x8000;
+ case 10:
+ w0[2] = w0[2] | 0x800000;
break;
- case 114:
- w[28] = w[28] | 0x800000;
+ case 11:
+ w0[2] = w0[2] | 0x80000000;
break;
- case 115:
- w[28] = w[28] | 0x80000000;
+ case 12:
+ w0[3] = 0x80;
break;
- case 116:
- w[29] = 0x80;
+ case 13:
+ w0[3] = w0[3] | 0x8000;
break;
- case 117:
- w[29] = w[29] | 0x8000;
+ case 14:
+ w0[3] = w0[3] | 0x800000;
break;
- case 118:
- w[29] = w[29] | 0x800000;
+ case 15:
+ w0[3] = w0[3] | 0x80000000;
break;
- case 119:
- w[29] = w[29] | 0x80000000;
+ case 16:
+ w1[0] = 0x80;
break;
- case 120:
- w[30] = 0x80;
+ case 17:
+ w1[0] = w1[0] | 0x8000;
break;
- case 121:
- w[30] = w[30] | 0x8000;
+ case 18:
+ w1[0] = w1[0] | 0x800000;
break;
- case 122:
- w[30] = w[30] | 0x800000;
+ case 19:
+ w1[0] = w1[0] | 0x80000000;
break;
- case 123:
- w[30] = w[30] | 0x80000000;
+ case 20:
+ w1[1] = 0x80;
break;
- case 124:
- w[31] = 0x80;
+ case 21:
+ w1[1] = w1[1] | 0x8000;
break;
- case 125:
- w[31] = w[31] | 0x8000;
+ case 22:
+ w1[1] = w1[1] | 0x800000;
break;
- case 126:
- w[31] = w[31] | 0x800000;
+ case 23:
+ w1[1] = w1[1] | 0x80000000;
break;
- case 127:
- w[31] = w[31] | 0x80000000;
+ case 24:
+ w1[2] = 0x80;
break;
- }
-}
-// before: device_memcat2L
-static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2])
-{
- switch (offset)
- {
- case 1:
- dst0[0] = src_l0[0] | src_r0[0] << 8;
- dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
+ case 25:
+ w1[2] = w1[2] | 0x8000;
break;
- case 2:
- dst0[0] = src_l0[0] | src_r0[0] << 16;
- dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
+ case 26:
+ w1[2] = w1[2] | 0x800000;
break;
- case 3:
- dst0[0] = src_l0[0] | src_r0[0] << 24;
- dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
+ case 27:
+ w1[2] = w1[2] | 0x80000000;
break;
- case 4:
- dst0[1] = src_r0[0];
+ case 28:
+ w1[3] = 0x80;
break;
- case 5:
- dst0[1] = src_l0[1] | src_r0[0] << 8;
+ case 29:
+ w1[3] = w1[3] | 0x8000;
break;
- case 6:
- dst0[1] = src_l0[1] | src_r0[0] << 16;
+ case 30:
+ w1[3] = w1[3] | 0x800000;
break;
- case 7:
- dst0[1] = src_l0[1] | src_r0[0] << 24;
+ case 31:
+ w1[3] = w1[3] | 0x80000000;
break;
}
}
-// before: device_memcat4L
-static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4])
+static void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
switch (offset)
{
- case 1:
- dst0[0] = src_l0[0] | src_r0[0] << 8;
- dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
- break;
-
- case 2:
- dst0[0] = src_l0[0] | src_r0[0] << 16;
- dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
- break;
-
- case 3:
- dst0[0] = src_l0[0] | src_r0[0] << 24;
- dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
- break;
-
- case 4:
- dst0[1] = src_r0[0];
- dst0[2] = src_r0[1];
- dst0[3] = src_r0[2];
- break;
-
- case 5:
- dst0[1] = src_l0[1] | src_r0[0] << 8;
- dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
- break;
-
- case 6:
- dst0[1] = src_l0[1] | src_r0[0] << 16;
- dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
- break;
-
- case 7:
- dst0[1] = src_l0[1] | src_r0[0] << 24;
- dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
- break;
-
- case 8:
- dst0[2] = src_r0[0];
- dst0[3] = src_r0[1];
- break;
-
- case 9:
- dst0[2] = src_l0[2] | src_r0[0] << 8;
- dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
- break;
-
- case 10:
- dst0[2] = src_l0[2] | src_r0[0] << 16;
- dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
- break;
-
- case 11:
- dst0[2] = src_l0[2] | src_r0[0] << 24;
- dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
- break;
-
- case 12:
- dst0[3] = src_r0[0];
- break;
-
- case 13:
- dst0[3] = src_l0[3] | src_r0[0] << 8;
- break;
-
- case 14:
- dst0[3] = src_l0[3] | src_r0[0] << 16;
- break;
-
- case 15:
- dst0[3] = src_l0[3] | src_r0[0] << 24;
+ case 0:
+ w0[0] = 0x80;
break;
- }
-}
-// before: device_memcat8L
-static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4])
-{
- switch (offset)
- {
case 1:
- dst0[0] = src_l0[0] | src_r0[0] << 8;
- dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[0] = src_r0[3] >> 24;
+ w0[0] = w0[0] | 0x8000;
break;
case 2:
- dst0[0] = src_l0[0] | src_r0[0] << 16;
- dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[0] = src_r0[3] >> 16;
+ w0[0] = w0[0] | 0x800000;
break;
case 3:
- dst0[0] = src_l0[0] | src_r0[0] << 24;
- dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[0] = src_r0[3] >> 8;
+ w0[0] = w0[0] | 0x80000000;
break;
case 4:
- dst0[1] = src_r0[0];
- dst0[2] = src_r0[1];
- dst0[3] = src_r0[2];
- dst1[0] = src_r0[3];
+ w0[1] = 0x80;
break;
case 5:
- dst0[1] = src_l0[1] | src_r0[0] << 8;
- dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[1] = src_r0[3] >> 24;
+ w0[1] = w0[1] | 0x8000;
break;
case 6:
- dst0[1] = src_l0[1] | src_r0[0] << 16;
- dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[1] = src_r0[3] >> 16;
+ w0[1] = w0[1] | 0x800000;
break;
case 7:
- dst0[1] = src_l0[1] | src_r0[0] << 24;
- dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[1] = src_r0[3] >> 8;
+ w0[1] = w0[1] | 0x80000000;
break;
case 8:
- dst0[2] = src_r0[0];
- dst0[3] = src_r0[1];
- dst1[0] = src_r0[2];
- dst1[1] = src_r0[3];
+ w0[2] = 0x80;
break;
case 9:
- dst0[2] = src_l0[2] | src_r0[0] << 8;
- dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[2] = src_r0[3] >> 24;
+ w0[2] = w0[2] | 0x8000;
break;
case 10:
- dst0[2] = src_l0[2] | src_r0[0] << 16;
- dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[2] = src_r0[3] >> 16;
+ w0[2] = w0[2] | 0x800000;
break;
case 11:
- dst0[2] = src_l0[2] | src_r0[0] << 24;
- dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[2] = src_r0[3] >> 8;
+ w0[2] = w0[2] | 0x80000000;
break;
case 12:
- dst0[3] = src_r0[0];
- dst1[0] = src_r0[1];
- dst1[1] = src_r0[2];
- dst1[2] = src_r0[3];
+ w0[3] = 0x80;
break;
case 13:
- dst0[3] = src_l0[3] | src_r0[0] << 8;
- dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[3] = src_r0[3] >> 24;
+ w0[3] = w0[3] | 0x8000;
break;
case 14:
- dst0[3] = src_l0[3] | src_r0[0] << 16;
- dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[3] = src_r0[3] >> 16;
+ w0[3] = w0[3] | 0x800000;
break;
case 15:
- dst0[3] = src_l0[3] | src_r0[0] << 24;
- dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[3] = src_r0[3] >> 8;
+ w0[3] = w0[3] | 0x80000000;
break;
case 16:
- dst1[0] = src_r0[0];
- dst1[1] = src_r0[1];
- dst1[2] = src_r0[2];
- dst1[3] = src_r0[3];
+ w1[0] = 0x80;
break;
case 17:
- dst1[0] = src_l1[0] | src_r0[0] << 8;
- dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
+ w1[0] = w1[0] | 0x8000;
break;
case 18:
- dst1[0] = src_l1[0] | src_r0[0] << 16;
- dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
+ w1[0] = w1[0] | 0x800000;
break;
case 19:
- dst1[0] = src_l1[0] | src_r0[0] << 24;
- dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
+ w1[0] = w1[0] | 0x80000000;
break;
case 20:
- dst1[1] = src_r0[0];
- dst1[2] = src_r0[1];
- dst1[3] = src_r0[2];
+ w1[1] = 0x80;
break;
case 21:
- dst1[1] = src_l1[1] | src_r0[0] << 8;
- dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
+ w1[1] = w1[1] | 0x8000;
break;
case 22:
- dst1[1] = src_l1[1] | src_r0[0] << 16;
- dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
+ w1[1] = w1[1] | 0x800000;
break;
case 23:
- dst1[1] = src_l1[1] | src_r0[0] << 24;
- dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
+ w1[1] = w1[1] | 0x80000000;
break;
case 24:
- dst1[2] = src_r0[0];
- dst1[3] = src_r0[1];
+ w1[2] = 0x80;
break;
case 25:
- dst1[2] = src_l1[2] | src_r0[0] << 8;
- dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
+ w1[2] = w1[2] | 0x8000;
break;
case 26:
- dst1[2] = src_l1[2] | src_r0[0] << 16;
- dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
+ w1[2] = w1[2] | 0x800000;
break;
case 27:
- dst1[2] = src_l1[2] | src_r0[0] << 24;
- dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
+ w1[2] = w1[2] | 0x80000000;
break;
case 28:
- dst1[3] = src_r0[0];
+ w1[3] = 0x80;
break;
case 29:
- dst1[3] = src_l1[3] | src_r0[0] << 8;
+ w1[3] = w1[3] | 0x8000;
break;
case 30:
- dst1[3] = src_l1[3] | src_r0[0] << 16;
+ w1[3] = w1[3] | 0x800000;
break;
case 31:
- dst1[3] = src_l1[3] | src_r0[0] << 24;
+ w1[3] = w1[3] | 0x80000000;
+ break;
+
+ case 32:
+ w2[0] = 0x80;
+ break;
+
+ case 33:
+ w2[0] = w2[0] | 0x8000;
+ break;
+
+ case 34:
+ w2[0] = w2[0] | 0x800000;
+ break;
+
+ case 35:
+ w2[0] = w2[0] | 0x80000000;
+ break;
+
+ case 36:
+ w2[1] = 0x80;
+ break;
+
+ case 37:
+ w2[1] = w2[1] | 0x8000;
+ break;
+
+ case 38:
+ w2[1] = w2[1] | 0x800000;
+ break;
+
+ case 39:
+ w2[1] = w2[1] | 0x80000000;
+ break;
+
+ case 40:
+ w2[2] = 0x80;
+ break;
+
+ case 41:
+ w2[2] = w2[2] | 0x8000;
+ break;
+
+ case 42:
+ w2[2] = w2[2] | 0x800000;
+ break;
+
+ case 43:
+ w2[2] = w2[2] | 0x80000000;
+ break;
+
+ case 44:
+ w2[3] = 0x80;
+ break;
+
+ case 45:
+ w2[3] = w2[3] | 0x8000;
+ break;
+
+ case 46:
+ w2[3] = w2[3] | 0x800000;
+ break;
+
+ case 47:
+ w2[3] = w2[3] | 0x80000000;
break;
}
}
-// before: device_memcat12L
-static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4])
+static void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
switch (offset)
{
+ case 0:
+ w0[0] = 0x80;
+ break;
+
case 1:
- dst0[0] = src_l0[0] | src_r0[0] << 8;
- dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[0] = src_r0[3] >> 24;
+ w0[0] = w0[0] | 0x8000;
break;
case 2:
- dst0[0] = src_l0[0] | src_r0[0] << 16;
- dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[0] = src_r0[3] >> 16;
+ w0[0] = w0[0] | 0x800000;
break;
case 3:
- dst0[0] = src_l0[0] | src_r0[0] << 24;
- dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[0] = src_r0[3] >> 8;
+ w0[0] = w0[0] | 0x80000000;
break;
case 4:
- dst0[1] = src_r0[0];
- dst0[2] = src_r0[1];
- dst0[3] = src_r0[2];
- dst1[0] = src_r0[3];
+ w0[1] = 0x80;
break;
case 5:
- dst0[1] = src_l0[1] | src_r0[0] << 8;
- dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[1] = src_r0[3] >> 24;
+ w0[1] = w0[1] | 0x8000;
break;
case 6:
- dst0[1] = src_l0[1] | src_r0[0] << 16;
- dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[1] = src_r0[3] >> 16;
+ w0[1] = w0[1] | 0x800000;
break;
case 7:
- dst0[1] = src_l0[1] | src_r0[0] << 24;
- dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[1] = src_r0[3] >> 8;
+ w0[1] = w0[1] | 0x80000000;
break;
case 8:
- dst0[2] = src_r0[0];
- dst0[3] = src_r0[1];
- dst1[0] = src_r0[2];
- dst1[1] = src_r0[3];
+ w0[2] = 0x80;
break;
case 9:
- dst0[2] = src_l0[2] | src_r0[0] << 8;
- dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[2] = src_r0[3] >> 24;
+ w0[2] = w0[2] | 0x8000;
break;
case 10:
- dst0[2] = src_l0[2] | src_r0[0] << 16;
- dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[2] = src_r0[3] >> 16;
+ w0[2] = w0[2] | 0x800000;
break;
case 11:
- dst0[2] = src_l0[2] | src_r0[0] << 24;
- dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[2] = src_r0[3] >> 8;
+ w0[2] = w0[2] | 0x80000000;
break;
case 12:
- dst0[3] = src_r0[0];
- dst1[0] = src_r0[1];
- dst1[1] = src_r0[2];
- dst1[2] = src_r0[3];
+ w0[3] = 0x80;
break;
case 13:
- dst0[3] = src_l0[3] | src_r0[0] << 8;
- dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[3] = src_r0[3] >> 24;
+ w0[3] = w0[3] | 0x8000;
break;
case 14:
- dst0[3] = src_l0[3] | src_r0[0] << 16;
- dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[3] = src_r0[3] >> 16;
+ w0[3] = w0[3] | 0x800000;
break;
case 15:
- dst0[3] = src_l0[3] | src_r0[0] << 24;
- dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[3] = src_r0[3] >> 8;
+ w0[3] = w0[3] | 0x80000000;
break;
case 16:
- dst1[0] = src_r0[0];
- dst1[1] = src_r0[1];
- dst1[2] = src_r0[2];
- dst1[3] = src_r0[3];
+ w1[0] = 0x80;
break;
case 17:
- dst1[0] = src_l1[0] | src_r0[0] << 8;
- dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst2[0] = src_r0[3] >> 24;
+ w1[0] = w1[0] | 0x8000;
break;
case 18:
- dst1[0] = src_l1[0] | src_r0[0] << 16;
- dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst2[0] = src_r0[3] >> 16;
+ w1[0] = w1[0] | 0x800000;
break;
case 19:
- dst1[0] = src_l1[0] | src_r0[0] << 24;
- dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst2[0] = src_r0[3] >> 8;
+ w1[0] = w1[0] | 0x80000000;
break;
case 20:
- dst1[1] = src_r0[0];
- dst1[2] = src_r0[1];
- dst1[3] = src_r0[2];
- dst2[0] = src_r0[3];
+ w1[1] = 0x80;
break;
case 21:
- dst1[1] = src_l1[1] | src_r0[0] << 8;
- dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst2[1] = src_r0[3] >> 24;
+ w1[1] = w1[1] | 0x8000;
break;
case 22:
- dst1[1] = src_l1[1] | src_r0[0] << 16;
- dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst2[1] = src_r0[3] >> 16;
+ w1[1] = w1[1] | 0x800000;
break;
case 23:
- dst1[1] = src_l1[1] | src_r0[0] << 24;
- dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst2[1] = src_r0[3] >> 8;
+ w1[1] = w1[1] | 0x80000000;
break;
case 24:
- dst1[2] = src_r0[0];
- dst1[3] = src_r0[1];
- dst2[0] = src_r0[2];
- dst2[1] = src_r0[3];
+ w1[2] = 0x80;
break;
case 25:
- dst1[2] = src_l1[2] | src_r0[0] << 8;
- dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst2[2] = src_r0[3] >> 24;
+ w1[2] = w1[2] | 0x8000;
break;
case 26:
- dst1[2] = src_l1[2] | src_r0[0] << 16;
- dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst2[2] = src_r0[3] >> 16;
+ w1[2] = w1[2] | 0x800000;
break;
case 27:
- dst1[2] = src_l1[2] | src_r0[0] << 24;
- dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst2[2] = src_r0[3] >> 8;
+ w1[2] = w1[2] | 0x80000000;
break;
case 28:
- dst1[3] = src_r0[0];
- dst2[0] = src_r0[1];
- dst2[1] = src_r0[2];
- dst2[2] = src_r0[3];
+ w1[3] = 0x80;
break;
case 29:
- dst1[3] = src_l1[3] | src_r0[0] << 8;
- dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst2[3] = src_r0[3] >> 24;
+ w1[3] = w1[3] | 0x8000;
break;
case 30:
- dst1[3] = src_l1[3] | src_r0[0] << 16;
- dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst2[3] = src_r0[3] >> 16;
+ w1[3] = w1[3] | 0x800000;
break;
case 31:
- dst1[3] = src_l1[3] | src_r0[0] << 24;
- dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst2[3] = src_r0[3] >> 8;
+ w1[3] = w1[3] | 0x80000000;
break;
case 32:
- dst2[0] = src_r0[0];
- dst2[1] = src_r0[1];
- dst2[2] = src_r0[2];
- dst2[3] = src_r0[3];
+ w2[0] = 0x80;
break;
case 33:
- dst2[0] = src_l2[0] | src_r0[0] << 8;
- dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8;
+ w2[0] = w2[0] | 0x8000;
break;
case 34:
- dst2[0] = src_l2[0] | src_r0[0] << 16;
- dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
+ w2[0] = w2[0] | 0x800000;
break;
case 35:
- dst2[0] = src_l2[0] | src_r0[0] << 24;
- dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24;
+ w2[0] = w2[0] | 0x80000000;
break;
case 36:
- dst2[1] = src_r0[0];
- dst2[2] = src_r0[1];
- dst2[3] = src_r0[2];
+ w2[1] = 0x80;
break;
case 37:
- dst2[1] = src_l2[1] | src_r0[0] << 8;
- dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8;
+ w2[1] = w2[1] | 0x8000;
break;
case 38:
- dst2[1] = src_l2[1] | src_r0[0] << 16;
- dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
+ w2[1] = w2[1] | 0x800000;
break;
case 39:
- dst2[1] = src_l2[1] | src_r0[0] << 24;
- dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24;
+ w2[1] = w2[1] | 0x80000000;
break;
case 40:
- dst2[2] = src_r0[0];
- dst2[3] = src_r0[1];
+ w2[2] = 0x80;
break;
case 41:
- dst2[2] = src_l2[2] | src_r0[0] << 8;
- dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8;
+ w2[2] = w2[2] | 0x8000;
break;
case 42:
- dst2[2] = src_l2[2] | src_r0[0] << 16;
- dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
+ w2[2] = w2[2] | 0x800000;
break;
case 43:
- dst2[2] = src_l2[2] | src_r0[0] << 24;
- dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24;
+ w2[2] = w2[2] | 0x80000000;
break;
case 44:
- dst2[3] = src_r0[0];
+ w2[3] = 0x80;
break;
case 45:
- dst2[3] = src_l2[3] | src_r0[0] << 8;
+ w2[3] = w2[3] | 0x8000;
break;
case 46:
- dst2[3] = src_l2[3] | src_r0[0] << 16;
+ w2[3] = w2[3] | 0x800000;
break;
case 47:
- dst2[3] = src_l2[3] | src_r0[0] << 24;
+ w2[3] = w2[3] | 0x80000000;
+ break;
+
+ case 48:
+ w3[0] = 0x80;
+ break;
+
+ case 49:
+ w3[0] = w3[0] | 0x8000;
+ break;
+
+ case 50:
+ w3[0] = w3[0] | 0x800000;
+ break;
+
+ case 51:
+ w3[0] = w3[0] | 0x80000000;
+ break;
+
+ case 52:
+ w3[1] = 0x80;
+ break;
+
+ case 53:
+ w3[1] = w3[1] | 0x8000;
+ break;
+
+ case 54:
+ w3[1] = w3[1] | 0x800000;
+ break;
+
+ case 55:
+ w3[1] = w3[1] | 0x80000000;
+ break;
+
+ case 56:
+ w3[2] = 0x80;
+ break;
+
+ case 57:
+ w3[2] = w3[2] | 0x8000;
+ break;
+
+ case 58:
+ w3[2] = w3[2] | 0x800000;
+ break;
+
+ case 59:
+ w3[2] = w3[2] | 0x80000000;
+ break;
+
+ case 60:
+ w3[3] = 0x80;
+ break;
+
+ case 61:
+ w3[3] = w3[3] | 0x8000;
+ break;
+
+ case 62:
+ w3[3] = w3[3] | 0x800000;
+ break;
+
+ case 63:
+ w3[3] = w3[3] | 0x80000000;
break;
}
}
-// before: device_memcat12L
-static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4])
+static void truncate_block_S (u32 w[4], const u32 len)
+{
+ switch (len)
+ {
+ case 0: w[0] &= 0;
+ w[1] &= 0;
+ w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 1: w[0] &= 0x000000FF;
+ w[1] &= 0;
+ w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 2: w[0] &= 0x0000FFFF;
+ w[1] &= 0;
+ w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 3: w[0] &= 0x00FFFFFF;
+ w[1] &= 0;
+ w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 4: w[1] &= 0;
+ w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 5: w[1] &= 0x000000FF;
+ w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 6: w[1] &= 0x0000FFFF;
+ w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 7: w[1] &= 0x00FFFFFF;
+ w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 8: w[2] &= 0;
+ w[3] &= 0;
+ break;
+ case 9: w[2] &= 0x000000FF;
+ w[3] &= 0;
+ break;
+ case 10: w[2] &= 0x0000FFFF;
+ w[3] &= 0;
+ break;
+ case 11: w[2] &= 0x00FFFFFF;
+ w[3] &= 0;
+ break;
+ case 12: w[3] &= 0;
+ break;
+ case 13: w[3] &= 0x000000FF;
+ break;
+ case 14: w[3] &= 0x0000FFFF;
+ break;
+ case 15: w[3] &= 0x00FFFFFF;
+ break;
+ }
+}
+
+static void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
{
- switch (offset)
- {
- case 0:
- dst0[0] = src_r0[0];
- dst0[1] = src_r0[1];
- dst0[2] = src_r0[2];
- dst0[3] = src_r0[3];
- dst1[0] = src_r1[0];
- dst1[1] = src_r1[1];
- dst1[2] = src_r1[2];
- dst1[3] = src_r1[3];
- break;
+ #ifdef IS_NV
+ out2[3] = __byte_perm_S (in[3], 0, 0x7372);
+ out2[2] = __byte_perm_S (in[3], 0, 0x7170);
+ out2[1] = __byte_perm_S (in[2], 0, 0x7372);
+ out2[0] = __byte_perm_S (in[2], 0, 0x7170);
+ out1[3] = __byte_perm_S (in[1], 0, 0x7372);
+ out1[2] = __byte_perm_S (in[1], 0, 0x7170);
+ out1[1] = __byte_perm_S (in[0], 0, 0x7372);
+ out1[0] = __byte_perm_S (in[0], 0, 0x7170);
+ #endif
- case 1:
- dst0[0] = src_l0[0] | src_r0[0] << 8;
- dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8;
- dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8;
- dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8;
- dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8;
- dst2[0] = src_r1[3] >> 24;
- break;
+ #if defined IS_AMD || defined IS_GENERIC
+ out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
+ out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
+ out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
+ out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
+ out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
+ out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
+ out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
+ out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
+ #endif
+}
- case 2:
- dst0[0] = src_l0[0] | src_r0[0] << 16;
- dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16;
- dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16;
- dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16;
- dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16;
- dst2[0] = src_r1[3] >> 16;
- break;
+static void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
+{
+ #ifdef IS_NV
+ out[0] = __byte_perm_S (in1[0], in1[1], 0x6420);
+ out[1] = __byte_perm_S (in1[2], in1[3], 0x6420);
+ out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
+ out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
+ #endif
- case 3:
- dst0[0] = src_l0[0] | src_r0[0] << 24;
- dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24;
- dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24;
- dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24;
- dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24;
- dst2[0] = src_r1[3] >> 8;
- break;
+ #if defined IS_AMD || defined IS_GENERIC
+ out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
+ | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
+ out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
+ | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
+ out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
+ | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
+ out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
+ | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
+ #endif
+}
- case 4:
- dst0[1] = src_r0[0];
- dst0[2] = src_r0[1];
- dst0[3] = src_r0[2];
- dst1[0] = src_r0[3];
- dst1[1] = src_r1[0];
- dst1[2] = src_r1[1];
- dst1[3] = src_r1[2];
- dst2[0] = src_r1[3];
- break;
+static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+{
+ #if defined IS_AMD || defined IS_GENERIC
+ const int offset_mod_4 = offset & 3;
- case 5:
- dst0[1] = src_l0[1] | src_r0[0] << 8;
- dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8;
- dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8;
- dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8;
- dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8;
- dst2[1] = src_r1[3] >> 24;
- break;
+ const int offset_minus_4 = 4 - offset;
- case 6:
- dst0[1] = src_l0[1] | src_r0[0] << 16;
- dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16;
- dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16;
- dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16;
- dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16;
- dst2[1] = src_r1[3] >> 16;
- break;
+ switch (offset / 4)
+ {
+ case 0:
+ w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4);
+ w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4);
+ w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
+ w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
+ w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
+ w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
+ w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
+ w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
+ w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
+ w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
- case 7:
- dst0[1] = src_l0[1] | src_r0[0] << 24;
- dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24;
- dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24;
- dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24;
- dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24;
- dst2[1] = src_r1[3] >> 8;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w0[0] = w0[1];
+ w0[1] = w0[2];
+ w0[2] = w0[3];
+ w0[3] = w1[0];
+ w1[0] = w1[1];
+ w1[1] = w1[2];
+ w1[2] = w1[3];
+ w1[3] = w2[0];
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 8:
- dst0[2] = src_r0[0];
- dst0[3] = src_r0[1];
- dst1[0] = src_r0[2];
- dst1[1] = src_r0[3];
- dst1[2] = src_r1[0];
- dst1[3] = src_r1[1];
- dst2[0] = src_r1[2];
- dst2[1] = src_r1[3];
break;
- case 9:
- dst0[2] = src_l0[2] | src_r0[0] << 8;
- dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8;
- dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8;
- dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8;
- dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8;
- dst2[2] = src_r1[3] >> 24;
- break;
+ case 1:
+ w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4);
+ w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
+ w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
+ w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
+ w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
+ w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
+ w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
+ w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
+ w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w0[0] = 0;
- case 10:
- dst0[2] = src_l0[2] | src_r0[0] << 16;
- dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16;
- dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16;
- dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16;
- dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16;
- dst2[2] = src_r1[3] >> 16;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w0[1] = w0[2];
+ w0[2] = w0[3];
+ w0[3] = w1[0];
+ w1[0] = w1[1];
+ w1[1] = w1[2];
+ w1[2] = w1[3];
+ w1[3] = w2[0];
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 11:
- dst0[2] = src_l0[2] | src_r0[0] << 24;
- dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24;
- dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24;
- dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24;
- dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24;
- dst2[2] = src_r1[3] >> 8;
break;
- case 12:
- dst0[3] = src_r0[0];
- dst1[0] = src_r0[1];
- dst1[1] = src_r0[2];
- dst1[2] = src_r0[3];
- dst1[3] = src_r1[0];
- dst2[0] = src_r1[1];
- dst2[1] = src_r1[2];
- dst2[2] = src_r1[3];
- break;
+ case 2:
+ w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4);
+ w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
+ w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
+ w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
+ w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
+ w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
+ w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
+ w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w0[1] = 0;
+ w0[0] = 0;
- case 13:
- dst0[3] = src_l0[3] | src_r0[0] << 8;
- dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8;
- dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8;
- dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8;
- dst2[2] = src_r1[2] >> 24 | src_r1[3] << 8;
- dst2[3] = src_r1[3] >> 24;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w0[2] = w0[3];
+ w0[3] = w1[0];
+ w1[0] = w1[1];
+ w1[1] = w1[2];
+ w1[2] = w1[3];
+ w1[3] = w2[0];
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 14:
- dst0[3] = src_l0[3] | src_r0[0] << 16;
- dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16;
- dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16;
- dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16;
- dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16;
- dst2[3] = src_r1[3] >> 16;
break;
- case 15:
- dst0[3] = src_l0[3] | src_r0[0] << 24;
- dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24;
- dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24;
- dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24;
- dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24;
- dst2[3] = src_r1[3] >> 8;
- break;
+ case 3:
+ w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4);
+ w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
+ w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
+ w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
+ w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
+ w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
+ w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 16:
- dst1[0] = src_r0[0];
- dst1[1] = src_r0[1];
- dst1[2] = src_r0[2];
- dst1[3] = src_r0[3];
- dst2[0] = src_r1[0];
- dst2[1] = src_r1[1];
- dst2[2] = src_r1[2];
- dst2[3] = src_r1[3];
- break;
+ if (offset_mod_4 == 0)
+ {
+ w0[3] = w1[0];
+ w1[0] = w1[1];
+ w1[1] = w1[2];
+ w1[2] = w1[3];
+ w1[3] = w2[0];
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 17:
- dst1[0] = src_l1[0] | src_r0[0] << 8;
- dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8;
- dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8;
- dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8;
- dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8;
break;
- case 18:
- dst1[0] = src_l1[0] | src_r0[0] << 16;
- dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16;
- dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16;
- dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16;
- dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16;
- break;
+ case 4:
+ w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4);
+ w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
+ w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
+ w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
+ w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
+ w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 19:
- dst1[0] = src_l1[0] | src_r0[0] << 24;
- dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24;
- dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24;
- dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24;
- dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w1[0] = w1[1];
+ w1[1] = w1[2];
+ w1[2] = w1[3];
+ w1[3] = w2[0];
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 20:
- dst1[1] = src_r1[0];
- dst1[2] = src_r0[1];
- dst1[3] = src_r0[2];
- dst2[0] = src_r0[3];
- dst2[1] = src_r1[0];
- dst2[2] = src_r1[1];
- dst2[3] = src_r1[2];
break;
- case 21:
- dst1[1] = src_l1[1] | src_r0[0] << 8;
- dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8;
- dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8;
- dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8;
- break;
+ case 5:
+ w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4);
+ w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
+ w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
+ w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
+ w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 22:
- dst1[1] = src_l1[1] | src_r0[0] << 16;
- dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16;
- dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16;
- dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w1[1] = w1[2];
+ w1[2] = w1[3];
+ w1[3] = w2[0];
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 23:
- dst1[1] = src_l1[1] | src_r0[0] << 24;
- dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24;
- dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24;
- dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24;
break;
- case 24:
- dst1[2] = src_r1[0];
- dst1[3] = src_r0[1];
- dst2[0] = src_r0[2];
- dst2[1] = src_r0[3];
- dst2[2] = src_r1[0];
- dst2[3] = src_r1[1];
- break;
+ case 6:
+ w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4);
+ w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
+ w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
+ w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 25:
- dst1[2] = src_l1[2] | src_r0[0] << 8;
- dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8;
- dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w1[2] = w1[3];
+ w1[3] = w2[0];
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 26:
- dst1[2] = src_l1[2] | src_r0[0] << 16;
- dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16;
- dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16;
break;
- case 27:
- dst1[2] = src_l1[2] | src_r0[0] << 24;
- dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24;
- dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24;
- break;
+ case 7:
+ w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4);
+ w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
+ w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 28:
- dst1[3] = src_r1[0];
- dst2[0] = src_r0[1];
- dst2[1] = src_r0[2];
- dst2[2] = src_r0[3];
- dst2[3] = src_r1[0];
- break;
+ if (offset_mod_4 == 0)
+ {
+ w1[3] = w2[0];
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 29:
- dst1[3] = src_l1[3] | src_r0[0] << 8;
- dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8;
- dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8;
break;
- case 30:
- dst1[3] = src_l1[3] | src_r0[0] << 16;
- dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
- dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16;
- break;
+ case 8:
+ w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4);
+ w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
+ w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 31:
- dst1[3] = src_l1[3] | src_r0[0] << 24;
- dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24;
- dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w2[0] = w2[1];
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 32:
- dst2[0] = src_r0[0];
- dst2[1] = src_r0[1];
- dst2[2] = src_r0[2];
- dst2[3] = src_r0[3];
break;
- case 33:
- dst2[0] = src_l2[0] | src_r0[0] << 8;
- dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8;
- dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8;
- break;
+ case 9:
+ w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4);
+ w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
+ w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 34:
- dst2[0] = src_l2[0] | src_r0[0] << 16;
- dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
- dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w2[1] = w2[2];
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 35:
- dst2[0] = src_l2[0] | src_r0[0] << 24;
- dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24;
- dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24;
break;
- case 36:
- dst2[1] = src_r0[0];
- dst2[2] = src_r0[1];
- dst2[3] = src_r0[2];
- break;
+ case 10:
+ w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4);
+ w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
+ w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 37:
- dst2[1] = src_l2[1] | src_r0[0] << 8;
- dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8;
- dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w2[2] = w2[3];
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 38:
- dst2[1] = src_l2[1] | src_r0[0] << 16;
- dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
- dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
break;
- case 39:
- dst2[1] = src_l2[1] | src_r0[0] << 24;
- dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24;
- dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24;
- break;
+ case 11:
+ w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4);
+ w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
+ w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 40:
- dst2[2] = src_r0[0];
- dst2[3] = src_r0[1];
- break;
+ if (offset_mod_4 == 0)
+ {
+ w2[3] = w3[0];
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 41:
- dst2[2] = src_l2[2] | src_r0[0] << 8;
- dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8;
break;
- case 42:
- dst2[2] = src_l2[2] | src_r0[0] << 16;
- dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
- break;
+ case 12:
+ w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4);
+ w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
+ w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w2[3] = 0;
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 43:
- dst2[2] = src_l2[2] | src_r0[0] << 24;
- dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w3[0] = w3[1];
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 44:
- dst2[3] = src_r0[0];
break;
- case 45:
- dst2[3] = src_l2[3] | src_r0[0] << 8;
- break;
+ case 13:
+ w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4);
+ w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
+ w3[0] = 0;
+ w2[3] = 0;
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 46:
- dst2[3] = src_l2[3] | src_r0[0] << 16;
- break;
+ if (offset_mod_4 == 0)
+ {
+ w3[1] = w3[2];
+ w3[2] = 0;
+ }
- case 47:
- dst2[3] = src_l2[3] | src_r0[0] << 24;
break;
}
-}
+ #endif
-// before: memcat16_9
-static void memcat_c15_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
-{
- switch (offset)
+ #ifdef IS_NV
+ const int offset_minus_4 = 4 - (offset % 4);
+
+ const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
+
+ switch (offset / 4)
{
case 0:
- w0[0] = append0[0];
- w0[1] = append0[1];
- w0[2] = append0[2];
- w0[3] = append0[3];
- w1[0] = append1[0];
- w1[1] = append1[1];
- w1[2] = append1[2];
- w1[3] = append1[3];
- w2[0] = append2[0];
+ w3[1] = __byte_perm_S (w3[0], w3[1], selector);
+ w3[0] = __byte_perm_S (w2[3], w3[0], selector);
+ w2[3] = __byte_perm_S (w2[2], w2[3], selector);
+ w2[2] = __byte_perm_S (w2[1], w2[2], selector);
+ w2[1] = __byte_perm_S (w2[0], w2[1], selector);
+ w2[0] = __byte_perm_S (w1[3], w2[0], selector);
+ w1[3] = __byte_perm_S (w1[2], w1[3], selector);
+ w1[2] = __byte_perm_S (w1[1], w1[2], selector);
+ w1[1] = __byte_perm_S (w1[0], w1[1], selector);
+ w1[0] = __byte_perm_S (w0[3], w1[0], selector);
+ w0[3] = __byte_perm_S (w0[2], w0[3], selector);
+ w0[2] = __byte_perm_S (w0[1], w0[2], selector);
+ w0[1] = __byte_perm_S (w0[0], w0[1], selector);
+ w0[0] = __byte_perm_S ( 0, w0[0], selector);
+
break;
case 1:
- w0[0] = w0[0] | append0[0] << 8;
- w0[1] = append0[0] >> 24 | append0[1] << 8;
- w0[2] = append0[1] >> 24 | append0[2] << 8;
- w0[3] = append0[2] >> 24 | append0[3] << 8;
- w1[0] = append0[3] >> 24 | append1[0] << 8;
- w1[1] = append1[0] >> 24 | append1[1] << 8;
- w1[2] = append1[1] >> 24 | append1[2] << 8;
- w1[3] = append1[2] >> 24 | append1[3] << 8;
- w2[0] = append1[3] >> 24 | append2[0] << 8;
- w2[1] = append2[0] >> 24;
+ w3[1] = __byte_perm_S (w2[3], w3[0], selector);
+ w3[0] = __byte_perm_S (w2[2], w2[3], selector);
+ w2[3] = __byte_perm_S (w2[1], w2[2], selector);
+ w2[2] = __byte_perm_S (w2[0], w2[1], selector);
+ w2[1] = __byte_perm_S (w1[3], w2[0], selector);
+ w2[0] = __byte_perm_S (w1[2], w1[3], selector);
+ w1[3] = __byte_perm_S (w1[1], w1[2], selector);
+ w1[2] = __byte_perm_S (w1[0], w1[1], selector);
+ w1[1] = __byte_perm_S (w0[3], w1[0], selector);
+ w1[0] = __byte_perm_S (w0[2], w0[3], selector);
+ w0[3] = __byte_perm_S (w0[1], w0[2], selector);
+ w0[2] = __byte_perm_S (w0[0], w0[1], selector);
+ w0[1] = __byte_perm_S ( 0, w0[0], selector);
+ w0[0] = 0;
+
break;
case 2:
- w0[0] = w0[0] | append0[0] << 16;
- w0[1] = append0[0] >> 16 | append0[1] << 16;
- w0[2] = append0[1] >> 16 | append0[2] << 16;
- w0[3] = append0[2] >> 16 | append0[3] << 16;
- w1[0] = append0[3] >> 16 | append1[0] << 16;
- w1[1] = append1[0] >> 16 | append1[1] << 16;
- w1[2] = append1[1] >> 16 | append1[2] << 16;
- w1[3] = append1[2] >> 16 | append1[3] << 16;
- w2[0] = append1[3] >> 16 | append2[0] << 16;
- w2[1] = append2[0] >> 16;
+ w3[1] = __byte_perm_S (w2[2], w2[3], selector);
+ w3[0] = __byte_perm_S (w2[1], w2[2], selector);
+ w2[3] = __byte_perm_S (w2[0], w2[1], selector);
+ w2[2] = __byte_perm_S (w1[3], w2[0], selector);
+ w2[1] = __byte_perm_S (w1[2], w1[3], selector);
+ w2[0] = __byte_perm_S (w1[1], w1[2], selector);
+ w1[3] = __byte_perm_S (w1[0], w1[1], selector);
+ w1[2] = __byte_perm_S (w0[3], w1[0], selector);
+ w1[1] = __byte_perm_S (w0[2], w0[3], selector);
+ w1[0] = __byte_perm_S (w0[1], w0[2], selector);
+ w0[3] = __byte_perm_S (w0[0], w0[1], selector);
+ w0[2] = __byte_perm_S ( 0, w0[0], selector);
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 3:
- w0[0] = w0[0] | append0[0] << 24;
- w0[1] = append0[0] >> 8 | append0[1] << 24;
- w0[2] = append0[1] >> 8 | append0[2] << 24;
- w0[3] = append0[2] >> 8 | append0[3] << 24;
- w1[0] = append0[3] >> 8 | append1[0] << 24;
- w1[1] = append1[0] >> 8 | append1[1] << 24;
- w1[2] = append1[1] >> 8 | append1[2] << 24;
- w1[3] = append1[2] >> 8 | append1[3] << 24;
- w2[0] = append1[3] >> 8 | append2[0] << 24;
- w2[1] = append2[0] >> 8;
+ w3[1] = __byte_perm_S (w2[1], w2[2], selector);
+ w3[0] = __byte_perm_S (w2[0], w2[1], selector);
+ w2[3] = __byte_perm_S (w1[3], w2[0], selector);
+ w2[2] = __byte_perm_S (w1[2], w1[3], selector);
+ w2[1] = __byte_perm_S (w1[1], w1[2], selector);
+ w2[0] = __byte_perm_S (w1[0], w1[1], selector);
+ w1[3] = __byte_perm_S (w0[3], w1[0], selector);
+ w1[2] = __byte_perm_S (w0[2], w0[3], selector);
+ w1[1] = __byte_perm_S (w0[1], w0[2], selector);
+ w1[0] = __byte_perm_S (w0[0], w0[1], selector);
+ w0[3] = __byte_perm_S ( 0, w0[0], selector);
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 4:
- w0[1] = append0[0];
- w0[2] = append0[1];
- w0[3] = append0[2];
- w1[0] = append0[3];
- w1[1] = append1[0];
- w1[2] = append1[1];
- w1[3] = append1[2];
- w2[0] = append1[3];
- w2[1] = append2[0];
+ w3[1] = __byte_perm_S (w2[0], w2[1], selector);
+ w3[0] = __byte_perm_S (w1[3], w2[0], selector);
+ w2[3] = __byte_perm_S (w1[2], w1[3], selector);
+ w2[2] = __byte_perm_S (w1[1], w1[2], selector);
+ w2[1] = __byte_perm_S (w1[0], w1[1], selector);
+ w2[0] = __byte_perm_S (w0[3], w1[0], selector);
+ w1[3] = __byte_perm_S (w0[2], w0[3], selector);
+ w1[2] = __byte_perm_S (w0[1], w0[2], selector);
+ w1[1] = __byte_perm_S (w0[0], w0[1], selector);
+ w1[0] = __byte_perm_S ( 0, w0[0], selector);
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 5:
- w0[1] = w0[1] | append0[0] << 8;
- w0[2] = append0[0] >> 24 | append0[1] << 8;
- w0[3] = append0[1] >> 24 | append0[2] << 8;
- w1[0] = append0[2] >> 24 | append0[3] << 8;
- w1[1] = append0[3] >> 24 | append1[0] << 8;
- w1[2] = append1[0] >> 24 | append1[1] << 8;
- w1[3] = append1[1] >> 24 | append1[2] << 8;
- w2[0] = append1[2] >> 24 | append1[3] << 8;
- w2[1] = append1[3] >> 24 | append2[0] << 8;
- w2[2] = append2[0] >> 24;
+ w3[1] = __byte_perm_S (w1[3], w2[0], selector);
+ w3[0] = __byte_perm_S (w1[2], w1[3], selector);
+ w2[3] = __byte_perm_S (w1[1], w1[2], selector);
+ w2[2] = __byte_perm_S (w1[0], w1[1], selector);
+ w2[1] = __byte_perm_S (w0[3], w1[0], selector);
+ w2[0] = __byte_perm_S (w0[2], w0[3], selector);
+ w1[3] = __byte_perm_S (w0[1], w0[2], selector);
+ w1[2] = __byte_perm_S (w0[0], w0[1], selector);
+ w1[1] = __byte_perm_S ( 0, w0[0], selector);
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 6:
- w0[1] = w0[1] | append0[0] << 16;
- w0[2] = append0[0] >> 16 | append0[1] << 16;
- w0[3] = append0[1] >> 16 | append0[2] << 16;
- w1[0] = append0[2] >> 16 | append0[3] << 16;
- w1[1] = append0[3] >> 16 | append1[0] << 16;
- w1[2] = append1[0] >> 16 | append1[1] << 16;
- w1[3] = append1[1] >> 16 | append1[2] << 16;
- w2[0] = append1[2] >> 16 | append1[3] << 16;
- w2[1] = append1[3] >> 16 | append2[0] << 16;
- w2[2] = append2[0] >> 16;
+ w3[1] = __byte_perm_S (w1[2], w1[3], selector);
+ w3[0] = __byte_perm_S (w1[1], w1[2], selector);
+ w2[3] = __byte_perm_S (w1[0], w1[1], selector);
+ w2[2] = __byte_perm_S (w0[3], w1[0], selector);
+ w2[1] = __byte_perm_S (w0[2], w0[3], selector);
+ w2[0] = __byte_perm_S (w0[1], w0[2], selector);
+ w1[3] = __byte_perm_S (w0[0], w0[1], selector);
+ w1[2] = __byte_perm_S ( 0, w0[0], selector);
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 7:
- w0[1] = w0[1] | append0[0] << 24;
- w0[2] = append0[0] >> 8 | append0[1] << 24;
- w0[3] = append0[1] >> 8 | append0[2] << 24;
- w1[0] = append0[2] >> 8 | append0[3] << 24;
- w1[1] = append0[3] >> 8 | append1[0] << 24;
- w1[2] = append1[0] >> 8 | append1[1] << 24;
- w1[3] = append1[1] >> 8 | append1[2] << 24;
- w2[0] = append1[2] >> 8 | append1[3] << 24;
- w2[1] = append1[3] >> 8 | append2[0] << 24;
- w2[2] = append2[0] >> 8;
+ w3[1] = __byte_perm_S (w1[1], w1[2], selector);
+ w3[0] = __byte_perm_S (w1[0], w1[1], selector);
+ w2[3] = __byte_perm_S (w0[3], w1[0], selector);
+ w2[2] = __byte_perm_S (w0[2], w0[3], selector);
+ w2[1] = __byte_perm_S (w0[1], w0[2], selector);
+ w2[0] = __byte_perm_S (w0[0], w0[1], selector);
+ w1[3] = __byte_perm_S ( 0, w0[0], selector);
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 8:
- w0[2] = append0[0];
- w0[3] = append0[1];
- w1[0] = append0[2];
- w1[1] = append0[3];
- w1[2] = append1[0];
- w1[3] = append1[1];
- w2[0] = append1[2];
- w2[1] = append1[3];
- w2[2] = append2[0];
+ w3[1] = __byte_perm_S (w1[0], w1[1], selector);
+ w3[0] = __byte_perm_S (w0[3], w1[0], selector);
+ w2[3] = __byte_perm_S (w0[2], w0[3], selector);
+ w2[2] = __byte_perm_S (w0[1], w0[2], selector);
+ w2[1] = __byte_perm_S (w0[0], w0[1], selector);
+ w2[0] = __byte_perm_S ( 0, w0[0], selector);
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 9:
- w0[2] = w0[2] | append0[0] << 8;
- w0[3] = append0[0] >> 24 | append0[1] << 8;
- w1[0] = append0[1] >> 24 | append0[2] << 8;
- w1[1] = append0[2] >> 24 | append0[3] << 8;
- w1[2] = append0[3] >> 24 | append1[0] << 8;
- w1[3] = append1[0] >> 24 | append1[1] << 8;
- w2[0] = append1[1] >> 24 | append1[2] << 8;
- w2[1] = append1[2] >> 24 | append1[3] << 8;
- w2[2] = append1[3] >> 24 | append2[0] << 8;
- w2[3] = append2[0] >> 24;
+ w3[1] = __byte_perm_S (w0[3], w1[0], selector);
+ w3[0] = __byte_perm_S (w0[2], w0[3], selector);
+ w2[3] = __byte_perm_S (w0[1], w0[2], selector);
+ w2[2] = __byte_perm_S (w0[0], w0[1], selector);
+ w2[1] = __byte_perm_S ( 0, w0[0], selector);
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 10:
- w0[2] = w0[2] | append0[0] << 16;
- w0[3] = append0[0] >> 16 | append0[1] << 16;
- w1[0] = append0[1] >> 16 | append0[2] << 16;
- w1[1] = append0[2] >> 16 | append0[3] << 16;
- w1[2] = append0[3] >> 16 | append1[0] << 16;
- w1[3] = append1[0] >> 16 | append1[1] << 16;
- w2[0] = append1[1] >> 16 | append1[2] << 16;
- w2[1] = append1[2] >> 16 | append1[3] << 16;
- w2[2] = append1[3] >> 16 | append2[0] << 16;
- w2[3] = append2[0] >> 16;
+ w3[1] = __byte_perm_S (w0[2], w0[3], selector);
+ w3[0] = __byte_perm_S (w0[1], w0[2], selector);
+ w2[3] = __byte_perm_S (w0[0], w0[1], selector);
+ w2[2] = __byte_perm_S ( 0, w0[0], selector);
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 11:
- w0[2] = w0[2] | append0[0] << 24;
- w0[3] = append0[0] >> 8 | append0[1] << 24;
- w1[0] = append0[1] >> 8 | append0[2] << 24;
- w1[1] = append0[2] >> 8 | append0[3] << 24;
- w1[2] = append0[3] >> 8 | append1[0] << 24;
- w1[3] = append1[0] >> 8 | append1[1] << 24;
- w2[0] = append1[1] >> 8 | append1[2] << 24;
- w2[1] = append1[2] >> 8 | append1[3] << 24;
- w2[2] = append1[3] >> 8 | append2[0] << 24;
- w2[3] = append2[0] >> 8;
+ w3[1] = __byte_perm_S (w0[1], w0[2], selector);
+ w3[0] = __byte_perm_S (w0[0], w0[1], selector);
+ w2[3] = __byte_perm_S ( 0, w0[0], selector);
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
+
break;
case 12:
- w0[3] = append0[0];
- w1[0] = append0[1];
- w1[1] = append0[2];
- w1[2] = append0[3];
- w1[3] = append1[0];
- w2[0] = append1[1];
- w2[1] = append1[2];
- w2[2] = append1[3];
- w2[3] = append2[0];
- break;
+ w3[1] = __byte_perm_S (w0[0], w0[1], selector);
+ w3[0] = __byte_perm_S ( 0, w0[0], selector);
+ w2[3] = 0;
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 13:
- w0[3] = w0[3] | append0[0] << 8;
- w1[0] = append0[0] >> 24 | append0[1] << 8;
- w1[1] = append0[1] >> 24 | append0[2] << 8;
- w1[2] = append0[2] >> 24 | append0[3] << 8;
- w1[3] = append0[3] >> 24 | append1[0] << 8;
- w2[0] = append1[0] >> 24 | append1[1] << 8;
- w2[1] = append1[1] >> 24 | append1[2] << 8;
- w2[2] = append1[2] >> 24 | append1[3] << 8;
- w2[3] = append1[3] >> 24 | append2[0] << 8;
- w3[0] = append2[0] >> 24;
break;
- case 14:
- w0[3] = w0[3] | append0[0] << 16;
- w1[0] = append0[0] >> 16 | append0[1] << 16;
- w1[1] = append0[1] >> 16 | append0[2] << 16;
- w1[2] = append0[2] >> 16 | append0[3] << 16;
- w1[3] = append0[3] >> 16 | append1[0] << 16;
- w2[0] = append1[0] >> 16 | append1[1] << 16;
- w2[1] = append1[1] >> 16 | append1[2] << 16;
- w2[2] = append1[2] >> 16 | append1[3] << 16;
- w2[3] = append1[3] >> 16 | append2[0] << 16;
- w3[0] = append2[0] >> 16;
- break;
+ case 13:
+ w3[1] = __byte_perm_S ( 0, w0[0], selector);
+ w3[0] = 0;
+ w2[3] = 0;
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
- case 15:
- w0[3] = w0[3] | append0[0] << 24;
- w1[0] = append0[0] >> 8 | append0[1] << 24;
- w1[1] = append0[1] >> 8 | append0[2] << 24;
- w1[2] = append0[2] >> 8 | append0[3] << 24;
- w1[3] = append0[3] >> 8 | append1[0] << 24;
- w2[0] = append1[0] >> 8 | append1[1] << 24;
- w2[1] = append1[1] >> 8 | append1[2] << 24;
- w2[2] = append1[2] >> 8 | append1[3] << 24;
- w2[3] = append1[3] >> 8 | append2[0] << 24;
- w3[0] = append2[0] >> 8;
break;
}
+ #endif
}
-// before: memcat32_8
-static void memcat_c32_w4x4_a2x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset)
+static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
- switch (offset)
+ #if defined IS_AMD || defined IS_GENERIC
+ switch (offset / 4)
{
case 0:
- w0[0] = append0[0];
- w0[1] = append0[1];
- w0[2] = append0[2];
- w0[3] = append0[3];
- w1[0] = append1[0];
- w1[1] = append1[1];
- w1[2] = append1[2];
- w1[3] = append1[3];
+ w3[2] = amd_bytealign_S (w3[1], 0, offset);
+ w3[1] = amd_bytealign_S (w3[0], w3[1], offset);
+ w3[0] = amd_bytealign_S (w2[3], w3[0], offset);
+ w2[3] = amd_bytealign_S (w2[2], w2[3], offset);
+ w2[2] = amd_bytealign_S (w2[1], w2[2], offset);
+ w2[1] = amd_bytealign_S (w2[0], w2[1], offset);
+ w2[0] = amd_bytealign_S (w1[3], w2[0], offset);
+ w1[3] = amd_bytealign_S (w1[2], w1[3], offset);
+ w1[2] = amd_bytealign_S (w1[1], w1[2], offset);
+ w1[1] = amd_bytealign_S (w1[0], w1[1], offset);
+ w1[0] = amd_bytealign_S (w0[3], w1[0], offset);
+ w0[3] = amd_bytealign_S (w0[2], w0[3], offset);
+ w0[2] = amd_bytealign_S (w0[1], w0[2], offset);
+ w0[1] = amd_bytealign_S (w0[0], w0[1], offset);
+ w0[0] = amd_bytealign_S ( 0, w0[0], offset);
break;
case 1:
- w0[0] = w0[0] | append0[0] << 8;
- w0[1] = append0[0] >> 24 | append0[1] << 8;
- w0[2] = append0[1] >> 24 | append0[2] << 8;
- w0[3] = append0[2] >> 24 | append0[3] << 8;
- w1[0] = append0[3] >> 24 | append1[0] << 8;
- w1[1] = append1[0] >> 24 | append1[1] << 8;
- w1[2] = append1[1] >> 24 | append1[2] << 8;
- w1[3] = append1[2] >> 24 | append1[3] << 8;
- w2[0] = append1[3] >> 24;
+ w3[2] = amd_bytealign_S (w3[0], 0, offset);
+ w3[1] = amd_bytealign_S (w2[3], w3[0], offset);
+ w3[0] = amd_bytealign_S (w2[2], w2[3], offset);
+ w2[3] = amd_bytealign_S (w2[1], w2[2], offset);
+ w2[2] = amd_bytealign_S (w2[0], w2[1], offset);
+ w2[1] = amd_bytealign_S (w1[3], w2[0], offset);
+ w2[0] = amd_bytealign_S (w1[2], w1[3], offset);
+ w1[3] = amd_bytealign_S (w1[1], w1[2], offset);
+ w1[2] = amd_bytealign_S (w1[0], w1[1], offset);
+ w1[1] = amd_bytealign_S (w0[3], w1[0], offset);
+ w1[0] = amd_bytealign_S (w0[2], w0[3], offset);
+ w0[3] = amd_bytealign_S (w0[1], w0[2], offset);
+ w0[2] = amd_bytealign_S (w0[0], w0[1], offset);
+ w0[1] = amd_bytealign_S ( 0, w0[0], offset);
+ w0[0] = 0;
break;
case 2:
- w0[0] = w0[0] | append0[0] << 16;
- w0[1] = append0[0] >> 16 | append0[1] << 16;
- w0[2] = append0[1] >> 16 | append0[2] << 16;
- w0[3] = append0[2] >> 16 | append0[3] << 16;
- w1[0] = append0[3] >> 16 | append1[0] << 16;
- w1[1] = append1[0] >> 16 | append1[1] << 16;
- w1[2] = append1[1] >> 16 | append1[2] << 16;
- w1[3] = append1[2] >> 16 | append1[3] << 16;
- w2[0] = append1[3] >> 16;
+ w3[2] = amd_bytealign_S (w2[3], 0, offset);
+ w3[1] = amd_bytealign_S (w2[2], w2[3], offset);
+ w3[0] = amd_bytealign_S (w2[1], w2[2], offset);
+ w2[3] = amd_bytealign_S (w2[0], w2[1], offset);
+ w2[2] = amd_bytealign_S (w1[3], w2[0], offset);
+ w2[1] = amd_bytealign_S (w1[2], w1[3], offset);
+ w2[0] = amd_bytealign_S (w1[1], w1[2], offset);
+ w1[3] = amd_bytealign_S (w1[0], w1[1], offset);
+ w1[2] = amd_bytealign_S (w0[3], w1[0], offset);
+ w1[1] = amd_bytealign_S (w0[2], w0[3], offset);
+ w1[0] = amd_bytealign_S (w0[1], w0[2], offset);
+ w0[3] = amd_bytealign_S (w0[0], w0[1], offset);
+ w0[2] = amd_bytealign_S ( 0, w0[0], offset);
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 3:
- w0[0] = w0[0] | append0[0] << 24;
- w0[1] = append0[0] >> 8 | append0[1] << 24;
- w0[2] = append0[1] >> 8 | append0[2] << 24;
- w0[3] = append0[2] >> 8 | append0[3] << 24;
- w1[0] = append0[3] >> 8 | append1[0] << 24;
- w1[1] = append1[0] >> 8 | append1[1] << 24;
- w1[2] = append1[1] >> 8 | append1[2] << 24;
- w1[3] = append1[2] >> 8 | append1[3] << 24;
- w2[0] = append1[3] >> 8;
+ w3[2] = amd_bytealign_S (w2[2], 0, offset);
+ w3[1] = amd_bytealign_S (w2[1], w2[2], offset);
+ w3[0] = amd_bytealign_S (w2[0], w2[1], offset);
+ w2[3] = amd_bytealign_S (w1[3], w2[0], offset);
+ w2[2] = amd_bytealign_S (w1[2], w1[3], offset);
+ w2[1] = amd_bytealign_S (w1[1], w1[2], offset);
+ w2[0] = amd_bytealign_S (w1[0], w1[1], offset);
+ w1[3] = amd_bytealign_S (w0[3], w1[0], offset);
+ w1[2] = amd_bytealign_S (w0[2], w0[3], offset);
+ w1[1] = amd_bytealign_S (w0[1], w0[2], offset);
+ w1[0] = amd_bytealign_S (w0[0], w0[1], offset);
+ w0[3] = amd_bytealign_S ( 0, w0[0], offset);
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 4:
- w0[1] = append0[0];
- w0[2] = append0[1];
- w0[3] = append0[2];
- w1[0] = append0[3];
- w1[1] = append1[0];
- w1[2] = append1[1];
- w1[3] = append1[2];
- w2[0] = append1[3];
+ w3[2] = amd_bytealign_S (w2[1], 0, offset);
+ w3[1] = amd_bytealign_S (w2[0], w2[1], offset);
+ w3[0] = amd_bytealign_S (w1[3], w2[0], offset);
+ w2[3] = amd_bytealign_S (w1[2], w1[3], offset);
+ w2[2] = amd_bytealign_S (w1[1], w1[2], offset);
+ w2[1] = amd_bytealign_S (w1[0], w1[1], offset);
+ w2[0] = amd_bytealign_S (w0[3], w1[0], offset);
+ w1[3] = amd_bytealign_S (w0[2], w0[3], offset);
+ w1[2] = amd_bytealign_S (w0[1], w0[2], offset);
+ w1[1] = amd_bytealign_S (w0[0], w0[1], offset);
+ w1[0] = amd_bytealign_S ( 0, w0[0], offset);
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 5:
- w0[1] = w0[1] | append0[0] << 8;
- w0[2] = append0[0] >> 24 | append0[1] << 8;
- w0[3] = append0[1] >> 24 | append0[2] << 8;
- w1[0] = append0[2] >> 24 | append0[3] << 8;
- w1[1] = append0[3] >> 24 | append1[0] << 8;
- w1[2] = append1[0] >> 24 | append1[1] << 8;
- w1[3] = append1[1] >> 24 | append1[2] << 8;
- w2[0] = append1[2] >> 24 | append1[3] << 8;
- w2[1] = append1[3] >> 24;
+ w3[2] = amd_bytealign_S (w2[0], 0, offset);
+ w3[1] = amd_bytealign_S (w1[3], w2[0], offset);
+ w3[0] = amd_bytealign_S (w1[2], w1[3], offset);
+ w2[3] = amd_bytealign_S (w1[1], w1[2], offset);
+ w2[2] = amd_bytealign_S (w1[0], w1[1], offset);
+ w2[1] = amd_bytealign_S (w0[3], w1[0], offset);
+ w2[0] = amd_bytealign_S (w0[2], w0[3], offset);
+ w1[3] = amd_bytealign_S (w0[1], w0[2], offset);
+ w1[2] = amd_bytealign_S (w0[0], w0[1], offset);
+ w1[1] = amd_bytealign_S ( 0, w0[0], offset);
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 6:
- w0[1] = w0[1] | append0[0] << 16;
- w0[2] = append0[0] >> 16 | append0[1] << 16;
- w0[3] = append0[1] >> 16 | append0[2] << 16;
- w1[0] = append0[2] >> 16 | append0[3] << 16;
- w1[1] = append0[3] >> 16 | append1[0] << 16;
- w1[2] = append1[0] >> 16 | append1[1] << 16;
- w1[3] = append1[1] >> 16 | append1[2] << 16;
- w2[0] = append1[2] >> 16 | append1[3] << 16;
- w2[1] = append1[3] >> 16;
+ w3[2] = amd_bytealign_S (w1[3], 0, offset);
+ w3[1] = amd_bytealign_S (w1[2], w1[3], offset);
+ w3[0] = amd_bytealign_S (w1[1], w1[2], offset);
+ w2[3] = amd_bytealign_S (w1[0], w1[1], offset);
+ w2[2] = amd_bytealign_S (w0[3], w1[0], offset);
+ w2[1] = amd_bytealign_S (w0[2], w0[3], offset);
+ w2[0] = amd_bytealign_S (w0[1], w0[2], offset);
+ w1[3] = amd_bytealign_S (w0[0], w0[1], offset);
+ w1[2] = amd_bytealign_S ( 0, w0[0], offset);
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 7:
- w0[1] = w0[1] | append0[0] << 24;
- w0[2] = append0[0] >> 8 | append0[1] << 24;
- w0[3] = append0[1] >> 8 | append0[2] << 24;
- w1[0] = append0[2] >> 8 | append0[3] << 24;
- w1[1] = append0[3] >> 8 | append1[0] << 24;
- w1[2] = append1[0] >> 8 | append1[1] << 24;
- w1[3] = append1[1] >> 8 | append1[2] << 24;
- w2[0] = append1[2] >> 8 | append1[3] << 24;
- w2[1] = append1[3] >> 8;
+ w3[2] = amd_bytealign_S (w1[2], 0, offset);
+ w3[1] = amd_bytealign_S (w1[1], w1[2], offset);
+ w3[0] = amd_bytealign_S (w1[0], w1[1], offset);
+ w2[3] = amd_bytealign_S (w0[3], w1[0], offset);
+ w2[2] = amd_bytealign_S (w0[2], w0[3], offset);
+ w2[1] = amd_bytealign_S (w0[1], w0[2], offset);
+ w2[0] = amd_bytealign_S (w0[0], w0[1], offset);
+ w1[3] = amd_bytealign_S ( 0, w0[0], offset);
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 8:
- w0[2] = append0[0];
- w0[3] = append0[1];
- w1[0] = append0[2];
- w1[1] = append0[3];
- w1[2] = append1[0];
- w1[3] = append1[1];
- w2[0] = append1[2];
- w2[1] = append1[3];
+ w3[2] = amd_bytealign_S (w1[1], 0, offset);
+ w3[1] = amd_bytealign_S (w1[0], w1[1], offset);
+ w3[0] = amd_bytealign_S (w0[3], w1[0], offset);
+ w2[3] = amd_bytealign_S (w0[2], w0[3], offset);
+ w2[2] = amd_bytealign_S (w0[1], w0[2], offset);
+ w2[1] = amd_bytealign_S (w0[0], w0[1], offset);
+ w2[0] = amd_bytealign_S ( 0, w0[0], offset);
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 9:
- w0[2] = w0[2] | append0[0] << 8;
- w0[3] = append0[0] >> 24 | append0[1] << 8;
- w1[0] = append0[1] >> 24 | append0[2] << 8;
- w1[1] = append0[2] >> 24 | append0[3] << 8;
- w1[2] = append0[3] >> 24 | append1[0] << 8;
- w1[3] = append1[0] >> 24 | append1[1] << 8;
- w2[0] = append1[1] >> 24 | append1[2] << 8;
- w2[1] = append1[2] >> 24 | append1[3] << 8;
- w2[2] = append1[3] >> 24;
+ w3[2] = amd_bytealign_S (w1[0], 0, offset);
+ w3[1] = amd_bytealign_S (w0[3], w1[0], offset);
+ w3[0] = amd_bytealign_S (w0[2], w0[3], offset);
+ w2[3] = amd_bytealign_S (w0[1], w0[2], offset);
+ w2[2] = amd_bytealign_S (w0[0], w0[1], offset);
+ w2[1] = amd_bytealign_S ( 0, w0[0], offset);
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 10:
- w0[2] = w0[2] | append0[0] << 16;
- w0[3] = append0[0] >> 16 | append0[1] << 16;
- w1[0] = append0[1] >> 16 | append0[2] << 16;
- w1[1] = append0[2] >> 16 | append0[3] << 16;
- w1[2] = append0[3] >> 16 | append1[0] << 16;
- w1[3] = append1[0] >> 16 | append1[1] << 16;
- w2[0] = append1[1] >> 16 | append1[2] << 16;
- w2[1] = append1[2] >> 16 | append1[3] << 16;
- w2[2] = append1[3] >> 16;
+ w3[2] = amd_bytealign_S (w0[3], 0, offset);
+ w3[1] = amd_bytealign_S (w0[2], w0[3], offset);
+ w3[0] = amd_bytealign_S (w0[1], w0[2], offset);
+ w2[3] = amd_bytealign_S (w0[0], w0[1], offset);
+ w2[2] = amd_bytealign_S ( 0, w0[0], offset);
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 11:
- w0[2] = w0[2] | append0[0] << 24;
- w0[3] = append0[0] >> 8 | append0[1] << 24;
- w1[0] = append0[1] >> 8 | append0[2] << 24;
- w1[1] = append0[2] >> 8 | append0[3] << 24;
- w1[2] = append0[3] >> 8 | append1[0] << 24;
- w1[3] = append1[0] >> 8 | append1[1] << 24;
- w2[0] = append1[1] >> 8 | append1[2] << 24;
- w2[1] = append1[2] >> 8 | append1[3] << 24;
- w2[2] = append1[3] >> 8;
+ w3[2] = amd_bytealign_S (w0[2], 0, offset);
+ w3[1] = amd_bytealign_S (w0[1], w0[2], offset);
+ w3[0] = amd_bytealign_S (w0[0], w0[1], offset);
+ w2[3] = amd_bytealign_S ( 0, w0[0], offset);
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 12:
- w0[3] = append0[0];
- w1[0] = append0[1];
- w1[1] = append0[2];
- w1[2] = append0[3];
- w1[3] = append1[0];
- w2[0] = append1[1];
- w2[1] = append1[2];
- w2[2] = append1[3];
+ w3[2] = amd_bytealign_S (w0[1], 0, offset);
+ w3[1] = amd_bytealign_S (w0[0], w0[1], offset);
+ w3[0] = amd_bytealign_S ( 0, w0[0], offset);
+ w2[3] = 0;
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 13:
- w0[3] = w0[3] | append0[0] << 8;
- w1[0] = append0[0] >> 24 | append0[1] << 8;
- w1[1] = append0[1] >> 24 | append0[2] << 8;
- w1[2] = append0[2] >> 24 | append0[3] << 8;
- w1[3] = append0[3] >> 24 | append1[0] << 8;
- w2[0] = append1[0] >> 24 | append1[1] << 8;
- w2[1] = append1[1] >> 24 | append1[2] << 8;
- w2[2] = append1[2] >> 24 | append1[3] << 8;
- w2[3] = append1[3] >> 24;
- break;
-
- case 14:
- w0[3] = w0[3] | append0[0] << 16;
- w1[0] = append0[0] >> 16 | append0[1] << 16;
- w1[1] = append0[1] >> 16 | append0[2] << 16;
- w1[2] = append0[2] >> 16 | append0[3] << 16;
- w1[3] = append0[3] >> 16 | append1[0] << 16;
- w2[0] = append1[0] >> 16 | append1[1] << 16;
- w2[1] = append1[1] >> 16 | append1[2] << 16;
- w2[2] = append1[2] >> 16 | append1[3] << 16;
- w2[3] = append1[3] >> 16;
- break;
-
- case 15:
- w0[3] = w0[3] | append0[0] << 24;
- w1[0] = append0[0] >> 8 | append0[1] << 24;
- w1[1] = append0[1] >> 8 | append0[2] << 24;
- w1[2] = append0[2] >> 8 | append0[3] << 24;
- w1[3] = append0[3] >> 8 | append1[0] << 24;
- w2[0] = append1[0] >> 8 | append1[1] << 24;
- w2[1] = append1[1] >> 8 | append1[2] << 24;
- w2[2] = append1[2] >> 8 | append1[3] << 24;
- w2[3] = append1[3] >> 8;
- break;
-
- case 16:
- w1[0] = append0[0];
- w1[1] = append0[1];
- w1[2] = append0[2];
- w1[3] = append0[3];
- w2[0] = append1[0];
- w2[1] = append1[1];
- w2[2] = append1[2];
- w2[3] = append1[3];
- break;
-
- case 17:
- w1[0] = w1[0] | append0[0] << 8;
- w1[1] = append0[0] >> 24 | append0[1] << 8;
- w1[2] = append0[1] >> 24 | append0[2] << 8;
- w1[3] = append0[2] >> 24 | append0[3] << 8;
- w2[0] = append0[3] >> 24 | append1[0] << 8;
- w2[1] = append1[0] >> 24 | append1[1] << 8;
- w2[2] = append1[1] >> 24 | append1[2] << 8;
- w2[3] = append1[2] >> 24 | append1[3] << 8;
- w3[0] = append1[3] >> 24;
- break;
-
- case 18:
- w1[0] = w1[0] | append0[0] << 16;
- w1[1] = append0[0] >> 16 | append0[1] << 16;
- w1[2] = append0[1] >> 16 | append0[2] << 16;
- w1[3] = append0[2] >> 16 | append0[3] << 16;
- w2[0] = append0[3] >> 16 | append1[0] << 16;
- w2[1] = append1[0] >> 16 | append1[1] << 16;
- w2[2] = append1[1] >> 16 | append1[2] << 16;
- w2[3] = append1[2] >> 16 | append1[3] << 16;
- w3[0] = append1[3] >> 16;
- break;
-
- case 19:
- w1[0] = w1[0] | append0[0] << 24;
- w1[1] = append0[0] >> 8 | append0[1] << 24;
- w1[2] = append0[1] >> 8 | append0[2] << 24;
- w1[3] = append0[2] >> 8 | append0[3] << 24;
- w2[0] = append0[3] >> 8 | append1[0] << 24;
- w2[1] = append1[0] >> 8 | append1[1] << 24;
- w2[2] = append1[1] >> 8 | append1[2] << 24;
- w2[3] = append1[2] >> 8 | append1[3] << 24;
- w3[0] = append1[3] >> 8;
- break;
-
- case 20:
- w1[1] = append0[0];
- w1[2] = append0[1];
- w1[3] = append0[2];
- w2[0] = append0[3];
- w2[1] = append1[0];
- w2[2] = append1[1];
- w2[3] = append1[2];
- w3[0] = append1[3];
- break;
-
- case 21:
- w1[1] = w1[1] | append0[0] << 8;
- w1[2] = append0[0] >> 24 | append0[1] << 8;
- w1[3] = append0[1] >> 24 | append0[2] << 8;
- w2[0] = append0[2] >> 24 | append0[3] << 8;
- w2[1] = append0[3] >> 24 | append1[0] << 8;
- w2[2] = append1[0] >> 24 | append1[1] << 8;
- w2[3] = append1[1] >> 24 | append1[2] << 8;
- w3[0] = append1[2] >> 24 | append1[3] << 8;
- w3[1] = append1[3] >> 24;
- break;
-
- case 22:
- w1[1] = w1[1] | append0[0] << 16;
- w1[2] = append0[0] >> 16 | append0[1] << 16;
- w1[3] = append0[1] >> 16 | append0[2] << 16;
- w2[0] = append0[2] >> 16 | append0[3] << 16;
- w2[1] = append0[3] >> 16 | append1[0] << 16;
- w2[2] = append1[0] >> 16 | append1[1] << 16;
- w2[3] = append1[1] >> 16 | append1[2] << 16;
- w3[0] = append1[2] >> 16 | append1[3] << 16;
- w3[1] = append1[3] >> 16;
- break;
-
- case 23:
- w1[1] = w1[1] | append0[0] << 24;
- w1[2] = append0[0] >> 8 | append0[1] << 24;
- w1[3] = append0[1] >> 8 | append0[2] << 24;
- w2[0] = append0[2] >> 8 | append0[3] << 24;
- w2[1] = append0[3] >> 8 | append1[0] << 24;
- w2[2] = append1[0] >> 8 | append1[1] << 24;
- w2[3] = append1[1] >> 8 | append1[2] << 24;
- w3[0] = append1[2] >> 8 | append1[3] << 24;
- w3[1] = append1[3] >> 8;
- break;
-
- case 24:
- w1[2] = append0[0];
- w1[3] = append0[1];
- w2[0] = append0[2];
- w2[1] = append0[3];
- w2[2] = append1[0];
- w2[3] = append1[1];
- w3[0] = append1[2];
- w3[1] = append1[3];
- break;
-
- case 25:
- w1[2] = w1[2] | append0[0] << 8;
- w1[3] = append0[0] >> 24 | append0[1] << 8;
- w2[0] = append0[1] >> 24 | append0[2] << 8;
- w2[1] = append0[2] >> 24 | append0[3] << 8;
- w2[2] = append0[3] >> 24 | append1[0] << 8;
- w2[3] = append1[0] >> 24 | append1[1] << 8;
- w3[0] = append1[1] >> 24 | append1[2] << 8;
- w3[1] = append1[2] >> 24 | append1[3] << 8;
- break;
-
- case 26:
- w1[2] = w1[2] | append0[0] << 16;
- w1[3] = append0[0] >> 16 | append0[1] << 16;
- w2[0] = append0[1] >> 16 | append0[2] << 16;
- w2[1] = append0[2] >> 16 | append0[3] << 16;
- w2[2] = append0[3] >> 16 | append1[0] << 16;
- w2[3] = append1[0] >> 16 | append1[1] << 16;
- w3[0] = append1[1] >> 16 | append1[2] << 16;
- w3[1] = append1[2] >> 16 | append1[3] << 16;
- break;
-
- case 27:
- w1[2] = w1[2] | append0[0] << 24;
- w1[3] = append0[0] >> 8 | append0[1] << 24;
- w2[0] = append0[1] >> 8 | append0[2] << 24;
- w2[1] = append0[2] >> 8 | append0[3] << 24;
- w2[2] = append0[3] >> 8 | append1[0] << 24;
- w2[3] = append1[0] >> 8 | append1[1] << 24;
- w3[0] = append1[1] >> 8 | append1[2] << 24;
- w3[1] = append1[2] >> 8 | append1[3] << 24;
- break;
-
- case 28:
- w1[3] = append0[0];
- w2[0] = append0[1];
- w2[1] = append0[2];
- w2[2] = append0[3];
- w2[3] = append1[0];
- w3[0] = append1[1];
- w3[1] = append1[2];
- break;
-
- case 29:
- w1[3] = w1[3] | append0[0] << 8;
- w2[0] = append0[0] >> 24 | append0[1] << 8;
- w2[1] = append0[1] >> 24 | append0[2] << 8;
- w2[2] = append0[2] >> 24 | append0[3] << 8;
- w2[3] = append0[3] >> 24 | append1[0] << 8;
- w3[0] = append1[0] >> 24 | append1[1] << 8;
- w3[1] = append1[1] >> 24 | append1[2] << 8;
- break;
-
- case 30:
- w1[3] = w1[3] | append0[0] << 16;
- w2[0] = append0[0] >> 16 | append0[1] << 16;
- w2[1] = append0[1] >> 16 | append0[2] << 16;
- w2[2] = append0[2] >> 16 | append0[3] << 16;
- w2[3] = append0[3] >> 16 | append1[0] << 16;
- w3[0] = append1[0] >> 16 | append1[1] << 16;
- w3[1] = append1[1] >> 16 | append1[2] << 16;
- break;
-
- case 31:
- w1[3] = w1[3] | append0[0] << 24;
- w2[0] = append0[0] >> 8 | append0[1] << 24;
- w2[1] = append0[1] >> 8 | append0[2] << 24;
- w2[2] = append0[2] >> 8 | append0[3] << 24;
- w2[3] = append0[3] >> 8 | append1[0] << 24;
- w3[0] = append1[0] >> 8 | append1[1] << 24;
- w3[1] = append1[1] >> 8 | append1[2] << 24;
- break;
-
- case 32:
- w2[0] = append0[0];
- w2[1] = append0[1];
- w2[2] = append0[2];
- w2[3] = append0[3];
- w3[0] = append1[0];
- w3[1] = append1[1];
+ w3[2] = amd_bytealign_S (w0[0], 0, offset);
+ w3[1] = amd_bytealign_S ( 0, w0[0], offset);
+ w3[0] = 0;
+ w2[3] = 0;
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
}
-}
+ #endif
-// before: memcat32_9
-static void memcat_c32_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
-{
- switch (offset)
+ #ifdef IS_NV
+ const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
+
+ switch (offset / 4)
{
case 0:
- w0[0] = append0[0];
- w0[1] = append0[1];
- w0[2] = append0[2];
- w0[3] = append0[3];
- w1[0] = append1[0];
- w1[1] = append1[1];
- w1[2] = append1[2];
- w1[3] = append1[3];
- w2[0] = append2[0];
+ w3[1] = __byte_perm_S (w3[1], w3[0], selector);
+ w3[0] = __byte_perm_S (w3[0], w2[3], selector);
+ w2[3] = __byte_perm_S (w2[3], w2[2], selector);
+ w2[2] = __byte_perm_S (w2[2], w2[1], selector);
+ w2[1] = __byte_perm_S (w2[1], w2[0], selector);
+ w2[0] = __byte_perm_S (w2[0], w1[3], selector);
+ w1[3] = __byte_perm_S (w1[3], w1[2], selector);
+ w1[2] = __byte_perm_S (w1[2], w1[1], selector);
+ w1[1] = __byte_perm_S (w1[1], w1[0], selector);
+ w1[0] = __byte_perm_S (w1[0], w0[3], selector);
+ w0[3] = __byte_perm_S (w0[3], w0[2], selector);
+ w0[2] = __byte_perm_S (w0[2], w0[1], selector);
+ w0[1] = __byte_perm_S (w0[1], w0[0], selector);
+ w0[0] = __byte_perm_S (w0[0], 0, selector);
break;
case 1:
- w0[0] = w0[0] | append0[0] << 8;
- w0[1] = append0[0] >> 24 | append0[1] << 8;
- w0[2] = append0[1] >> 24 | append0[2] << 8;
- w0[3] = append0[2] >> 24 | append0[3] << 8;
- w1[0] = append0[3] >> 24 | append1[0] << 8;
- w1[1] = append1[0] >> 24 | append1[1] << 8;
- w1[2] = append1[1] >> 24 | append1[2] << 8;
- w1[3] = append1[2] >> 24 | append1[3] << 8;
- w2[0] = append1[3] >> 24 | append2[0] << 8;
- w2[1] = append2[0] >> 24;
+ w3[1] = __byte_perm_S (w3[0], w2[3], selector);
+ w3[0] = __byte_perm_S (w2[3], w2[2], selector);
+ w2[3] = __byte_perm_S (w2[2], w2[1], selector);
+ w2[2] = __byte_perm_S (w2[1], w2[0], selector);
+ w2[1] = __byte_perm_S (w2[0], w1[3], selector);
+ w2[0] = __byte_perm_S (w1[3], w1[2], selector);
+ w1[3] = __byte_perm_S (w1[2], w1[1], selector);
+ w1[2] = __byte_perm_S (w1[1], w1[0], selector);
+ w1[1] = __byte_perm_S (w1[0], w0[3], selector);
+ w1[0] = __byte_perm_S (w0[3], w0[2], selector);
+ w0[3] = __byte_perm_S (w0[2], w0[1], selector);
+ w0[2] = __byte_perm_S (w0[1], w0[0], selector);
+ w0[1] = __byte_perm_S (w0[0], 0, selector);
+ w0[0] = 0;
break;
case 2:
- w0[0] = w0[0] | append0[0] << 16;
- w0[1] = append0[0] >> 16 | append0[1] << 16;
- w0[2] = append0[1] >> 16 | append0[2] << 16;
- w0[3] = append0[2] >> 16 | append0[3] << 16;
- w1[0] = append0[3] >> 16 | append1[0] << 16;
- w1[1] = append1[0] >> 16 | append1[1] << 16;
- w1[2] = append1[1] >> 16 | append1[2] << 16;
- w1[3] = append1[2] >> 16 | append1[3] << 16;
- w2[0] = append1[3] >> 16 | append2[0] << 16;
- w2[1] = append2[0] >> 16;
+ w3[1] = __byte_perm_S (w2[3], w2[2], selector);
+ w3[0] = __byte_perm_S (w2[2], w2[1], selector);
+ w2[3] = __byte_perm_S (w2[1], w2[0], selector);
+ w2[2] = __byte_perm_S (w2[0], w1[3], selector);
+ w2[1] = __byte_perm_S (w1[3], w1[2], selector);
+ w2[0] = __byte_perm_S (w1[2], w1[1], selector);
+ w1[3] = __byte_perm_S (w1[1], w1[0], selector);
+ w1[2] = __byte_perm_S (w1[0], w0[3], selector);
+ w1[1] = __byte_perm_S (w0[3], w0[2], selector);
+ w1[0] = __byte_perm_S (w0[2], w0[1], selector);
+ w0[3] = __byte_perm_S (w0[1], w0[0], selector);
+ w0[2] = __byte_perm_S (w0[0], 0, selector);
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 3:
- w0[0] = w0[0] | append0[0] << 24;
- w0[1] = append0[0] >> 8 | append0[1] << 24;
- w0[2] = append0[1] >> 8 | append0[2] << 24;
- w0[3] = append0[2] >> 8 | append0[3] << 24;
- w1[0] = append0[3] >> 8 | append1[0] << 24;
- w1[1] = append1[0] >> 8 | append1[1] << 24;
- w1[2] = append1[1] >> 8 | append1[2] << 24;
- w1[3] = append1[2] >> 8 | append1[3] << 24;
- w2[0] = append1[3] >> 8 | append2[0] << 24;
- w2[1] = append2[0] >> 8;
+ w3[1] = __byte_perm_S (w2[2], w2[1], selector);
+ w3[0] = __byte_perm_S (w2[1], w2[0], selector);
+ w2[3] = __byte_perm_S (w2[0], w1[3], selector);
+ w2[2] = __byte_perm_S (w1[3], w1[2], selector);
+ w2[1] = __byte_perm_S (w1[2], w1[1], selector);
+ w2[0] = __byte_perm_S (w1[1], w1[0], selector);
+ w1[3] = __byte_perm_S (w1[0], w0[3], selector);
+ w1[2] = __byte_perm_S (w0[3], w0[2], selector);
+ w1[1] = __byte_perm_S (w0[2], w0[1], selector);
+ w1[0] = __byte_perm_S (w0[1], w0[0], selector);
+ w0[3] = __byte_perm_S (w0[0], 0, selector);
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 4:
- w0[1] = append0[0];
- w0[2] = append0[1];
- w0[3] = append0[2];
- w1[0] = append0[3];
- w1[1] = append1[0];
- w1[2] = append1[1];
- w1[3] = append1[2];
- w2[0] = append1[3];
- w2[1] = append2[0];
+ w3[1] = __byte_perm_S (w2[1], w2[0], selector);
+ w3[0] = __byte_perm_S (w2[0], w1[3], selector);
+ w2[3] = __byte_perm_S (w1[3], w1[2], selector);
+ w2[2] = __byte_perm_S (w1[2], w1[1], selector);
+ w2[1] = __byte_perm_S (w1[1], w1[0], selector);
+ w2[0] = __byte_perm_S (w1[0], w0[3], selector);
+ w1[3] = __byte_perm_S (w0[3], w0[2], selector);
+ w1[2] = __byte_perm_S (w0[2], w0[1], selector);
+ w1[1] = __byte_perm_S (w0[1], w0[0], selector);
+ w1[0] = __byte_perm_S (w0[0], 0, selector);
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 5:
- w0[1] = w0[1] | append0[0] << 8;
- w0[2] = append0[0] >> 24 | append0[1] << 8;
- w0[3] = append0[1] >> 24 | append0[2] << 8;
- w1[0] = append0[2] >> 24 | append0[3] << 8;
- w1[1] = append0[3] >> 24 | append1[0] << 8;
- w1[2] = append1[0] >> 24 | append1[1] << 8;
- w1[3] = append1[1] >> 24 | append1[2] << 8;
- w2[0] = append1[2] >> 24 | append1[3] << 8;
- w2[1] = append1[3] >> 24 | append2[0] << 8;
- w2[2] = append2[0] >> 24;
+ w3[1] = __byte_perm_S (w2[0], w1[3], selector);
+ w3[0] = __byte_perm_S (w1[3], w1[2], selector);
+ w2[3] = __byte_perm_S (w1[2], w1[1], selector);
+ w2[2] = __byte_perm_S (w1[1], w1[0], selector);
+ w2[1] = __byte_perm_S (w1[0], w0[3], selector);
+ w2[0] = __byte_perm_S (w0[3], w0[2], selector);
+ w1[3] = __byte_perm_S (w0[2], w0[1], selector);
+ w1[2] = __byte_perm_S (w0[1], w0[0], selector);
+ w1[1] = __byte_perm_S (w0[0], 0, selector);
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 6:
- w0[1] = w0[1] | append0[0] << 16;
- w0[2] = append0[0] >> 16 | append0[1] << 16;
- w0[3] = append0[1] >> 16 | append0[2] << 16;
- w1[0] = append0[2] >> 16 | append0[3] << 16;
- w1[1] = append0[3] >> 16 | append1[0] << 16;
- w1[2] = append1[0] >> 16 | append1[1] << 16;
- w1[3] = append1[1] >> 16 | append1[2] << 16;
- w2[0] = append1[2] >> 16 | append1[3] << 16;
- w2[1] = append1[3] >> 16 | append2[0] << 16;
- w2[2] = append2[0] >> 16;
+ w3[1] = __byte_perm_S (w1[3], w1[2], selector);
+ w3[0] = __byte_perm_S (w1[2], w1[1], selector);
+ w2[3] = __byte_perm_S (w1[1], w1[0], selector);
+ w2[2] = __byte_perm_S (w1[0], w0[3], selector);
+ w2[1] = __byte_perm_S (w0[3], w0[2], selector);
+ w2[0] = __byte_perm_S (w0[2], w0[1], selector);
+ w1[3] = __byte_perm_S (w0[1], w0[0], selector);
+ w1[2] = __byte_perm_S (w0[0], 0, selector);
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 7:
- w0[1] = w0[1] | append0[0] << 24;
- w0[2] = append0[0] >> 8 | append0[1] << 24;
- w0[3] = append0[1] >> 8 | append0[2] << 24;
- w1[0] = append0[2] >> 8 | append0[3] << 24;
- w1[1] = append0[3] >> 8 | append1[0] << 24;
- w1[2] = append1[0] >> 8 | append1[1] << 24;
- w1[3] = append1[1] >> 8 | append1[2] << 24;
- w2[0] = append1[2] >> 8 | append1[3] << 24;
- w2[1] = append1[3] >> 8 | append2[0] << 24;
- w2[2] = append2[0] >> 8;
+ w3[1] = __byte_perm_S (w1[2], w1[1], selector);
+ w3[0] = __byte_perm_S (w1[1], w1[0], selector);
+ w2[3] = __byte_perm_S (w1[0], w0[3], selector);
+ w2[2] = __byte_perm_S (w0[3], w0[2], selector);
+ w2[1] = __byte_perm_S (w0[2], w0[1], selector);
+ w2[0] = __byte_perm_S (w0[1], w0[0], selector);
+ w1[3] = __byte_perm_S (w0[0], 0, selector);
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 8:
- w0[2] = append0[0];
- w0[3] = append0[1];
- w1[0] = append0[2];
- w1[1] = append0[3];
- w1[2] = append1[0];
- w1[3] = append1[1];
- w2[0] = append1[2];
- w2[1] = append1[3];
- w2[2] = append2[0];
+ w3[1] = __byte_perm_S (w1[1], w1[0], selector);
+ w3[0] = __byte_perm_S (w1[0], w0[3], selector);
+ w2[3] = __byte_perm_S (w0[3], w0[2], selector);
+ w2[2] = __byte_perm_S (w0[2], w0[1], selector);
+ w2[1] = __byte_perm_S (w0[1], w0[0], selector);
+ w2[0] = __byte_perm_S (w0[0], 0, selector);
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 9:
- w0[2] = w0[2] | append0[0] << 8;
- w0[3] = append0[0] >> 24 | append0[1] << 8;
- w1[0] = append0[1] >> 24 | append0[2] << 8;
- w1[1] = append0[2] >> 24 | append0[3] << 8;
- w1[2] = append0[3] >> 24 | append1[0] << 8;
- w1[3] = append1[0] >> 24 | append1[1] << 8;
- w2[0] = append1[1] >> 24 | append1[2] << 8;
- w2[1] = append1[2] >> 24 | append1[3] << 8;
- w2[2] = append1[3] >> 24 | append2[0] << 8;
- w2[3] = append2[0] >> 24;
+ w3[1] = __byte_perm_S (w1[0], w0[3], selector);
+ w3[0] = __byte_perm_S (w0[3], w0[2], selector);
+ w2[3] = __byte_perm_S (w0[2], w0[1], selector);
+ w2[2] = __byte_perm_S (w0[1], w0[0], selector);
+ w2[1] = __byte_perm_S (w0[0], 0, selector);
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 10:
- w0[2] = w0[2] | append0[0] << 16;
- w0[3] = append0[0] >> 16 | append0[1] << 16;
- w1[0] = append0[1] >> 16 | append0[2] << 16;
- w1[1] = append0[2] >> 16 | append0[3] << 16;
- w1[2] = append0[3] >> 16 | append1[0] << 16;
- w1[3] = append1[0] >> 16 | append1[1] << 16;
- w2[0] = append1[1] >> 16 | append1[2] << 16;
- w2[1] = append1[2] >> 16 | append1[3] << 16;
- w2[2] = append1[3] >> 16 | append2[0] << 16;
- w2[3] = append2[0] >> 16;
+ w3[1] = __byte_perm_S (w0[3], w0[2], selector);
+ w3[0] = __byte_perm_S (w0[2], w0[1], selector);
+ w2[3] = __byte_perm_S (w0[1], w0[0], selector);
+ w2[2] = __byte_perm_S (w0[0], 0, selector);
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 11:
- w0[2] = w0[2] | append0[0] << 24;
- w0[3] = append0[0] >> 8 | append0[1] << 24;
- w1[0] = append0[1] >> 8 | append0[2] << 24;
- w1[1] = append0[2] >> 8 | append0[3] << 24;
- w1[2] = append0[3] >> 8 | append1[0] << 24;
- w1[3] = append1[0] >> 8 | append1[1] << 24;
- w2[0] = append1[1] >> 8 | append1[2] << 24;
- w2[1] = append1[2] >> 8 | append1[3] << 24;
- w2[2] = append1[3] >> 8 | append2[0] << 24;
- w2[3] = append2[0] >> 8;
+ w3[1] = __byte_perm_S (w0[2], w0[1], selector);
+ w3[0] = __byte_perm_S (w0[1], w0[0], selector);
+ w2[3] = __byte_perm_S (w0[0], 0, selector);
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 12:
- w0[3] = append0[0];
- w1[0] = append0[1];
- w1[1] = append0[2];
- w1[2] = append0[3];
- w1[3] = append1[0];
- w2[0] = append1[1];
- w2[1] = append1[2];
- w2[2] = append1[3];
- w2[3] = append2[0];
+ w3[1] = __byte_perm_S (w0[1], w0[0], selector);
+ w3[0] = __byte_perm_S (w0[0], 0, selector);
+ w2[3] = 0;
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
case 13:
- w0[3] = w0[3] | append0[0] << 8;
- w1[0] = append0[0] >> 24 | append0[1] << 8;
- w1[1] = append0[1] >> 24 | append0[2] << 8;
- w1[2] = append0[2] >> 24 | append0[3] << 8;
- w1[3] = append0[3] >> 24 | append1[0] << 8;
- w2[0] = append1[0] >> 24 | append1[1] << 8;
- w2[1] = append1[1] >> 24 | append1[2] << 8;
- w2[2] = append1[2] >> 24 | append1[3] << 8;
- w2[3] = append1[3] >> 24 | append2[0] << 8;
- w3[0] = append2[0] >> 24;
- break;
-
- case 14:
- w0[3] = w0[3] | append0[0] << 16;
- w1[0] = append0[0] >> 16 | append0[1] << 16;
- w1[1] = append0[1] >> 16 | append0[2] << 16;
- w1[2] = append0[2] >> 16 | append0[3] << 16;
- w1[3] = append0[3] >> 16 | append1[0] << 16;
- w2[0] = append1[0] >> 16 | append1[1] << 16;
- w2[1] = append1[1] >> 16 | append1[2] << 16;
- w2[2] = append1[2] >> 16 | append1[3] << 16;
- w2[3] = append1[3] >> 16 | append2[0] << 16;
- w3[0] = append2[0] >> 16;
- break;
-
- case 15:
- w0[3] = w0[3] | append0[0] << 24;
- w1[0] = append0[0] >> 8 | append0[1] << 24;
- w1[1] = append0[1] >> 8 | append0[2] << 24;
- w1[2] = append0[2] >> 8 | append0[3] << 24;
- w1[3] = append0[3] >> 8 | append1[0] << 24;
- w2[0] = append1[0] >> 8 | append1[1] << 24;
- w2[1] = append1[1] >> 8 | append1[2] << 24;
- w2[2] = append1[2] >> 8 | append1[3] << 24;
- w2[3] = append1[3] >> 8 | append2[0] << 24;
- w3[0] = append2[0] >> 8;
- break;
-
- case 16:
- w1[0] = append0[0];
- w1[1] = append0[1];
- w1[2] = append0[2];
- w1[3] = append0[3];
- w2[0] = append1[0];
- w2[1] = append1[1];
- w2[2] = append1[2];
- w2[3] = append1[3];
- w3[0] = append2[0];
- break;
-
- case 17:
- w1[0] = w1[0] | append0[0] << 8;
- w1[1] = append0[0] >> 24 | append0[1] << 8;
- w1[2] = append0[1] >> 24 | append0[2] << 8;
- w1[3] = append0[2] >> 24 | append0[3] << 8;
- w2[0] = append0[3] >> 24 | append1[0] << 8;
- w2[1] = append1[0] >> 24 | append1[1] << 8;
- w2[2] = append1[1] >> 24 | append1[2] << 8;
- w2[3] = append1[2] >> 24 | append1[3] << 8;
- w3[0] = append1[3] >> 24 | append2[0] << 8;
- w3[1] = append2[0] >> 24;
- break;
-
- case 18:
- w1[0] = w1[0] | append0[0] << 16;
- w1[1] = append0[0] >> 16 | append0[1] << 16;
- w1[2] = append0[1] >> 16 | append0[2] << 16;
- w1[3] = append0[2] >> 16 | append0[3] << 16;
- w2[0] = append0[3] >> 16 | append1[0] << 16;
- w2[1] = append1[0] >> 16 | append1[1] << 16;
- w2[2] = append1[1] >> 16 | append1[2] << 16;
- w2[3] = append1[2] >> 16 | append1[3] << 16;
- w3[0] = append1[3] >> 16 | append2[0] << 16;
- w3[1] = append2[0] >> 16;
- break;
-
- case 19:
- w1[0] = w1[0] | append0[0] << 24;
- w1[1] = append0[0] >> 8 | append0[1] << 24;
- w1[2] = append0[1] >> 8 | append0[2] << 24;
- w1[3] = append0[2] >> 8 | append0[3] << 24;
- w2[0] = append0[3] >> 8 | append1[0] << 24;
- w2[1] = append1[0] >> 8 | append1[1] << 24;
- w2[2] = append1[1] >> 8 | append1[2] << 24;
- w2[3] = append1[2] >> 8 | append1[3] << 24;
- w3[0] = append1[3] >> 8 | append2[0] << 24;
- w3[1] = append2[0] >> 8;
- break;
-
- case 20:
- w1[1] = append0[0];
- w1[2] = append0[1];
- w1[3] = append0[2];
- w2[0] = append0[3];
- w2[1] = append1[0];
- w2[2] = append1[1];
- w2[3] = append1[2];
- w3[0] = append1[3];
- w3[1] = append2[0];
- break;
-
- case 21:
- w1[1] = w1[1] | append0[0] << 8;
- w1[2] = append0[0] >> 24 | append0[1] << 8;
- w1[3] = append0[1] >> 24 | append0[2] << 8;
- w2[0] = append0[2] >> 24 | append0[3] << 8;
- w2[1] = append0[3] >> 24 | append1[0] << 8;
- w2[2] = append1[0] >> 24 | append1[1] << 8;
- w2[3] = append1[1] >> 24 | append1[2] << 8;
- w3[0] = append1[2] >> 24 | append1[3] << 8;
- w3[1] = append1[3] >> 24 | append2[0] << 8;
- break;
-
- case 22:
- w1[1] = w1[1] | append0[0] << 16;
- w1[2] = append0[0] >> 16 | append0[1] << 16;
- w1[3] = append0[1] >> 16 | append0[2] << 16;
- w2[0] = append0[2] >> 16 | append0[3] << 16;
- w2[1] = append0[3] >> 16 | append1[0] << 16;
- w2[2] = append1[0] >> 16 | append1[1] << 16;
- w2[3] = append1[1] >> 16 | append1[2] << 16;
- w3[0] = append1[2] >> 16 | append1[3] << 16;
- w3[1] = append1[3] >> 16 | append2[0] << 16;
- break;
-
- case 23:
- w1[1] = w1[1] | append0[0] << 24;
- w1[2] = append0[0] >> 8 | append0[1] << 24;
- w1[3] = append0[1] >> 8 | append0[2] << 24;
- w2[0] = append0[2] >> 8 | append0[3] << 24;
- w2[1] = append0[3] >> 8 | append1[0] << 24;
- w2[2] = append1[0] >> 8 | append1[1] << 24;
- w2[3] = append1[1] >> 8 | append1[2] << 24;
- w3[0] = append1[2] >> 8 | append1[3] << 24;
- w3[1] = append1[3] >> 8 | append2[0] << 24;
- break;
-
- case 24:
- w1[2] = append0[0];
- w1[3] = append0[1];
- w2[0] = append0[2];
- w2[1] = append0[3];
- w2[2] = append1[0];
- w2[3] = append1[1];
- w3[0] = append1[2];
- w3[1] = append1[3];
- break;
-
- case 25:
- w1[2] = w1[2] | append0[0] << 8;
- w1[3] = append0[0] >> 24 | append0[1] << 8;
- w2[0] = append0[1] >> 24 | append0[2] << 8;
- w2[1] = append0[2] >> 24 | append0[3] << 8;
- w2[2] = append0[3] >> 24 | append1[0] << 8;
- w2[3] = append1[0] >> 24 | append1[1] << 8;
- w3[0] = append1[1] >> 24 | append1[2] << 8;
- w3[1] = append1[2] >> 24 | append1[3] << 8;
- break;
-
- case 26:
- w1[2] = w1[2] | append0[0] << 16;
- w1[3] = append0[0] >> 16 | append0[1] << 16;
- w2[0] = append0[1] >> 16 | append0[2] << 16;
- w2[1] = append0[2] >> 16 | append0[3] << 16;
- w2[2] = append0[3] >> 16 | append1[0] << 16;
- w2[3] = append1[0] >> 16 | append1[1] << 16;
- w3[0] = append1[1] >> 16 | append1[2] << 16;
- w3[1] = append1[2] >> 16 | append1[3] << 16;
- break;
-
- case 27:
- w1[2] = w1[2] | append0[0] << 24;
- w1[3] = append0[0] >> 8 | append0[1] << 24;
- w2[0] = append0[1] >> 8 | append0[2] << 24;
- w2[1] = append0[2] >> 8 | append0[3] << 24;
- w2[2] = append0[3] >> 8 | append1[0] << 24;
- w2[3] = append1[0] >> 8 | append1[1] << 24;
- w3[0] = append1[1] >> 8 | append1[2] << 24;
- w3[1] = append1[2] >> 8 | append1[3] << 24;
- break;
-
- case 28:
- w1[3] = append0[0];
- w2[0] = append0[1];
- w2[1] = append0[2];
- w2[2] = append0[3];
- w2[3] = append1[0];
- w3[0] = append1[1];
- w3[1] = append1[2];
- break;
-
- case 29:
- w1[3] = w1[3] | append0[0] << 8;
- w2[0] = append0[0] >> 24 | append0[1] << 8;
- w2[1] = append0[1] >> 24 | append0[2] << 8;
- w2[2] = append0[2] >> 24 | append0[3] << 8;
- w2[3] = append0[3] >> 24 | append1[0] << 8;
- w3[0] = append1[0] >> 24 | append1[1] << 8;
- w3[1] = append1[1] >> 24 | append1[2] << 8;
- break;
-
- case 30:
- w1[3] = w1[3] | append0[0] << 16;
- w2[0] = append0[0] >> 16 | append0[1] << 16;
- w2[1] = append0[1] >> 16 | append0[2] << 16;
- w2[2] = append0[2] >> 16 | append0[3] << 16;
- w2[3] = append0[3] >> 16 | append1[0] << 16;
- w3[0] = append1[0] >> 16 | append1[1] << 16;
- w3[1] = append1[1] >> 16 | append1[2] << 16;
- break;
-
- case 31:
- w1[3] = w1[3] | append0[0] << 24;
- w2[0] = append0[0] >> 8 | append0[1] << 24;
- w2[1] = append0[1] >> 8 | append0[2] << 24;
- w2[2] = append0[2] >> 8 | append0[3] << 24;
- w2[3] = append0[3] >> 8 | append1[0] << 24;
- w3[0] = append1[0] >> 8 | append1[1] << 24;
- w3[1] = append1[1] >> 8 | append1[2] << 24;
- break;
-
- case 32:
- w2[0] = append0[0];
- w2[1] = append0[1];
- w2[2] = append0[2];
- w2[3] = append0[3];
- w3[0] = append1[0];
- w3[1] = append1[1];
+ w3[1] = __byte_perm_S (w0[0], 0, selector);
+ w3[0] = 0;
+ w2[3] = 0;
+ w2[2] = 0;
+ w2[1] = 0;
+ w2[0] = 0;
+ w1[3] = 0;
+ w1[2] = 0;
+ w1[1] = 0;
+ w1[0] = 0;
+ w0[3] = 0;
+ w0[2] = 0;
+ w0[1] = 0;
+ w0[0] = 0;
break;
}
+ #endif
}
-
-*/
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define MD5_STEP_REV(f,a,b,c,d,x,t,s) \
{ \
a -= b; \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= f (b, c, d); \
a -= x; \
a -= t; \
#define MD5_STEP_REV1(f,a,b,c,d,x,t,s) \
{ \
a -= b; \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= x; \
a -= t; \
}
* reverse
*/
- u32x a_rev = digests_buf[digests_offset].digest_buf[0];
- u32x b_rev = digests_buf[digests_offset].digest_buf[1];
- u32x c_rev = digests_buf[digests_offset].digest_buf[2];
- u32x d_rev = digests_buf[digests_offset].digest_buf[3];
-
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30);
-
- const u32x pre_cd = c_rev ^ d_rev;
-
- MD5_STEP_REV1(MD5_H, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23);
- MD5_STEP_REV1(MD5_H, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22);
+ u32 a_rev = digests_buf[digests_offset].digest_buf[0];
+ u32 b_rev = digests_buf[digests_offset].digest_buf[1];
+ u32 c_rev = digests_buf[digests_offset].digest_buf[2];
+ u32 d_rev = digests_buf[digests_offset].digest_buf[3];
+
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30);
+
+ const u32 pre_cd = c_rev ^ d_rev;
+
+ MD5_STEP_REV1(MD5_H_S, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23);
+ MD5_STEP_REV1(MD5_H_S, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22);
/**
* loop
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 pw_salt_len = out_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 pw_salt_len = out_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
#define MD5_STEP_REV(f,a,b,c,d,x,t,s) \
{ \
a -= b; \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= f (b, c, d); \
a -= x; \
a -= t; \
#define MD5_STEP_REV1(f,a,b,c,d,x,t,s) \
{ \
a -= b; \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= x; \
a -= t; \
}
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
w[ 0] |= salt_buf0[0];
w[ 1] |= salt_buf0[1];
* reverse
*/
- u32x a_rev = digests_buf[digests_offset].digest_buf[0];
- u32x b_rev = digests_buf[digests_offset].digest_buf[1];
- u32x c_rev = digests_buf[digests_offset].digest_buf[2];
- u32x d_rev = digests_buf[digests_offset].digest_buf[3];
-
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30);
-
- const u32x pre_cd = c_rev ^ d_rev;
-
- MD5_STEP_REV1(MD5_H, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23);
- MD5_STEP_REV1(MD5_H, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22);
+ u32 a_rev = digests_buf[digests_offset].digest_buf[0];
+ u32 b_rev = digests_buf[digests_offset].digest_buf[1];
+ u32 c_rev = digests_buf[digests_offset].digest_buf[2];
+ u32 d_rev = digests_buf[digests_offset].digest_buf[3];
+
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30);
+
+ const u32 pre_cd = c_rev ^ d_rev;
+
+ MD5_STEP_REV1(MD5_H_S, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23);
+ MD5_STEP_REV1(MD5_H_S, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22);
/**
* loop
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w3_t[2] = pw_salt_len * 8;
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = w0[0];
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
/**
* loop
*/
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- #if VECT_SIZE == 1
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i);
- #elif VECT_SIZE == 2
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
- #elif VECT_SIZE == 4
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
- #elif VECT_SIZE == 8
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
- #endif
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
- /**
- * prepend salt
- */
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_le (wx, w0lr, salt_len);
u32x w0_t[4];
u32x w1_t[4];
u32x w2_t[4];
u32x w3_t[4];
- w0_t[0] = w0lr;
- w0_t[1] = w0[1];
- w0_t[2] = w0[2];
- w0_t[3] = w0[3];
- w1_t[0] = w1[0];
- w1_t[1] = w1[1];
- w1_t[2] = w1[2];
- w1_t[3] = w1[3];
- w2_t[0] = w2[0];
- w2_t[1] = w2[1];
- w2_t[2] = w2[2];
- w2_t[3] = w2[3];
- w3_t[0] = w3[0];
- w3_t[1] = w3[1];
- w3_t[2] = w3[2];
- w3_t[3] = w3[3];
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
-
+ w0_t[0] = wx[ 0];
+ w0_t[1] = wx[ 1];
+ w0_t[2] = wx[ 2];
+ w0_t[3] = wx[ 3];
+ w1_t[0] = wx[ 4];
+ w1_t[1] = wx[ 5];
+ w1_t[2] = wx[ 6];
+ w1_t[3] = wx[ 7];
+ w2_t[0] = wx[ 8];
+ w2_t[1] = wx[ 9];
+ w2_t[2] = wx[10];
+ w2_t[3] = wx[11];
+ w3_t[0] = wx[12];
+ w3_t[1] = wx[13];
w3_t[2] = pw_salt_len * 8;
-
- w0_t[0] |= salt_buf0[0];
- w0_t[1] |= salt_buf0[1];
- w0_t[2] |= salt_buf0[2];
- w0_t[3] |= salt_buf0[3];
- w1_t[0] |= salt_buf1[0];
- w1_t[1] |= salt_buf1[1];
- w1_t[2] |= salt_buf1[2];
- w1_t[3] |= salt_buf1[3];
- w2_t[0] |= salt_buf2[0];
- w2_t[1] |= salt_buf2[1];
- w2_t[2] |= salt_buf2[2];
- w2_t[3] |= salt_buf2[3];
- w3_t[0] |= salt_buf3[0];
- w3_t[1] |= salt_buf3[1];
- w3_t[2] |= salt_buf3[2];
- w3_t[3] |= salt_buf3[3];
+ w3_t[3] = 0;
/**
* md5
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = w0[0];
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
/**
* loop
*/
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- #if VECT_SIZE == 1
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i);
- #elif VECT_SIZE == 2
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
- #elif VECT_SIZE == 4
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
- #elif VECT_SIZE == 8
- const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
- #endif
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
- /**
- * prepend salt
- */
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_le (wx, w0lr, salt_len);
u32x w0_t[4];
u32x w1_t[4];
u32x w2_t[4];
u32x w3_t[4];
- w0_t[0] = w0lr;
- w0_t[1] = w0[1];
- w0_t[2] = w0[2];
- w0_t[3] = w0[3];
- w1_t[0] = w1[0];
- w1_t[1] = w1[1];
- w1_t[2] = w1[2];
- w1_t[3] = w1[3];
- w2_t[0] = w2[0];
- w2_t[1] = w2[1];
- w2_t[2] = w2[2];
- w2_t[3] = w2[3];
- w3_t[0] = w3[0];
- w3_t[1] = w3[1];
- w3_t[2] = w3[2];
- w3_t[3] = w3[3];
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
-
+ w0_t[0] = wx[ 0];
+ w0_t[1] = wx[ 1];
+ w0_t[2] = wx[ 2];
+ w0_t[3] = wx[ 3];
+ w1_t[0] = wx[ 4];
+ w1_t[1] = wx[ 5];
+ w1_t[2] = wx[ 6];
+ w1_t[3] = wx[ 7];
+ w2_t[0] = wx[ 8];
+ w2_t[1] = wx[ 9];
+ w2_t[2] = wx[10];
+ w2_t[3] = wx[11];
+ w3_t[0] = wx[12];
+ w3_t[1] = wx[13];
w3_t[2] = pw_salt_len * 8;
-
- w0_t[0] |= salt_buf0[0];
- w0_t[1] |= salt_buf0[1];
- w0_t[2] |= salt_buf0[2];
- w0_t[3] |= salt_buf0[3];
- w1_t[0] |= salt_buf1[0];
- w1_t[1] |= salt_buf1[1];
- w1_t[2] |= salt_buf1[2];
- w1_t[3] |= salt_buf1[3];
- w2_t[0] |= salt_buf2[0];
- w2_t[1] |= salt_buf2[1];
- w2_t[2] |= salt_buf2[2];
- w2_t[3] |= salt_buf2[3];
- w3_t[0] |= salt_buf3[0];
- w3_t[1] |= salt_buf3[1];
- w3_t[2] |= salt_buf3[2];
- w3_t[3] |= salt_buf3[3];
+ w3_t[3] = 0;
/**
* md5
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2));
const u32 out_salt_len = (out_len * 2) + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2));
const u32 out_salt_len = (out_len * 2) + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2));
const u32 pw_salt_len = (pw_len * 2) + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2));
const u32 pw_salt_len = (pw_len * 2) + salt_len;
#define MD5_STEP_REV(f,a,b,c,d,x,t,s) \
{ \
a -= b; \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= f (b, c, d); \
a -= x; \
a -= t; \
#define MD5_STEP_REV1(f,a,b,c,d,x,t,s) \
{ \
a -= b; \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= x; \
a -= t; \
}
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
w[ 0] |= salt_buf0[0];
w[ 1] |= salt_buf0[1];
* reverse
*/
- u32x a_rev = digests_buf[digests_offset].digest_buf[0];
- u32x b_rev = digests_buf[digests_offset].digest_buf[1];
- u32x c_rev = digests_buf[digests_offset].digest_buf[2];
- u32x d_rev = digests_buf[digests_offset].digest_buf[3];
-
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30);
- MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33);
- MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32);
- MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31);
- MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30);
-
- const u32x pre_cd = c_rev ^ d_rev;
-
- MD5_STEP_REV1(MD5_H, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23);
- MD5_STEP_REV1(MD5_H, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22);
+ u32 a_rev = digests_buf[digests_offset].digest_buf[0];
+ u32 b_rev = digests_buf[digests_offset].digest_buf[1];
+ u32 c_rev = digests_buf[digests_offset].digest_buf[2];
+ u32 d_rev = digests_buf[digests_offset].digest_buf[3];
+
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30);
+ MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33);
+ MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32);
+ MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31);
+ MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30);
+
+ const u32 pre_cd = c_rev ^ d_rev;
+
+ MD5_STEP_REV1(MD5_H_S, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23);
+ MD5_STEP_REV1(MD5_H_S, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22);
/**
* loop
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
* md5
*/
+ u32 tmp2;
+
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
- MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
* md5
*/
+ u32 tmp2;
+
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
- MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
* md5
*/
+ u32 tmp2;
+
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
- MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
* md5
*/
+ u32 tmp2;
+
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
- MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = w0[0];
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- /**
- * prepend salt
- */
-
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
-
- w0_t[0] = w0[0];
- w0_t[1] = w0[1];
- w0_t[2] = w0[2];
- w0_t[3] = w0[3];
- w1_t[0] = w1[0];
- w1_t[1] = w1[1];
- w1_t[2] = w1[2];
- w1_t[3] = w1[3];
- w2_t[0] = w2[0];
- w2_t[1] = w2[1];
- w2_t[2] = w2[2];
- w2_t[3] = w2[3];
- w3_t[0] = w3[0];
- w3_t[1] = w3[1];
- w3_t[2] = w3[2];
- w3_t[3] = w3[3];
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
-
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_le (wx, w0lr, salt_len);
+
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = wx[ 0];
+ w0_t[1] = wx[ 1];
+ w0_t[2] = wx[ 2];
+ w0_t[3] = wx[ 3];
+ w1_t[0] = wx[ 4];
+ w1_t[1] = wx[ 5];
+ w1_t[2] = wx[ 6];
+ w1_t[3] = wx[ 7];
+ w2_t[0] = wx[ 8];
+ w2_t[1] = wx[ 9];
+ w2_t[2] = wx[10];
+ w2_t[3] = wx[11];
+ w3_t[0] = wx[12];
+ w3_t[1] = wx[13];
w3_t[2] = pw_salt_len * 8;
-
- w0_t[0] |= salt_buf0[0];
- w0_t[1] |= salt_buf0[1];
- w0_t[2] |= salt_buf0[2];
- w0_t[3] |= salt_buf0[3];
- w1_t[0] |= salt_buf1[0];
- w1_t[1] |= salt_buf1[1];
- w1_t[2] |= salt_buf1[2];
- w1_t[3] |= salt_buf1[3];
- w2_t[0] |= salt_buf2[0];
- w2_t[1] |= salt_buf2[1];
- w2_t[2] |= salt_buf2[2];
- w2_t[3] |= salt_buf2[3];
- w3_t[0] |= salt_buf3[0];
- w3_t[1] |= salt_buf3[1];
- w3_t[2] |= salt_buf3[2];
- w3_t[3] |= salt_buf3[3];
+ w3_t[3] = 0;
/**
* md5
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x tmp2;
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
- MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
-
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = w0[0];
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- /**
- * prepend salt
- */
-
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
-
- w0_t[0] = w0[0];
- w0_t[1] = w0[1];
- w0_t[2] = w0[2];
- w0_t[3] = w0[3];
- w1_t[0] = w1[0];
- w1_t[1] = w1[1];
- w1_t[2] = w1[2];
- w1_t[3] = w1[3];
- w2_t[0] = w2[0];
- w2_t[1] = w2[1];
- w2_t[2] = w2[2];
- w2_t[3] = w2[3];
- w3_t[0] = w3[0];
- w3_t[1] = w3[1];
- w3_t[2] = w3[2];
- w3_t[3] = w3[3];
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
-
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_le (wx, w0lr, salt_len);
+
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = wx[ 0];
+ w0_t[1] = wx[ 1];
+ w0_t[2] = wx[ 2];
+ w0_t[3] = wx[ 3];
+ w1_t[0] = wx[ 4];
+ w1_t[1] = wx[ 5];
+ w1_t[2] = wx[ 6];
+ w1_t[3] = wx[ 7];
+ w2_t[0] = wx[ 8];
+ w2_t[1] = wx[ 9];
+ w2_t[2] = wx[10];
+ w2_t[3] = wx[11];
+ w3_t[0] = wx[12];
+ w3_t[1] = wx[13];
w3_t[2] = pw_salt_len * 8;
-
- w0_t[0] |= salt_buf0[0];
- w0_t[1] |= salt_buf0[1];
- w0_t[2] |= salt_buf0[2];
- w0_t[3] |= salt_buf0[3];
- w1_t[0] |= salt_buf1[0];
- w1_t[1] |= salt_buf1[1];
- w1_t[2] |= salt_buf1[2];
- w1_t[3] |= salt_buf1[3];
- w2_t[0] |= salt_buf2[0];
- w2_t[1] |= salt_buf2[1];
- w2_t[2] |= salt_buf2[2];
- w2_t[3] |= salt_buf2[3];
- w3_t[0] |= salt_buf3[0];
- w3_t[1] |= salt_buf3[1];
- w3_t[2] |= salt_buf3[2];
- w3_t[3] |= salt_buf3[3];
+ w3_t[3] = 0;
/**
* md5
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x tmp2;
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
- MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
- bool q_cond = allx (search[0] != a);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS (a, search[0])) continue;
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
-
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
-
- u32 tmp2;
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
+
+ u32x tmp2;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
digest[3] += d;
}
-static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4])
+static void hmac_md5_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
md5_transform (w0, w1, w2, w3, opad);
}
-static void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4])
+static void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[4];
- u32 opad[4];
+ u32x ipad[4];
+ u32x opad[4];
hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = (64 + salt_len) * 8;
w3_t[3] = 0;
- u32 digest[4];
+ u32x digest[4];
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[0];
- const u32 r1 = digest[3];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[4];
- u32 opad[4];
+ u32x ipad[4];
+ u32x opad[4];
hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = (64 + salt_len) * 8;
w3_t[3] = 0;
- u32 digest[4];
+ u32x digest[4];
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[0];
- const u32 r1 = digest[3];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
-
- u32 tmp2;
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
+
+ u32x tmp2;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
digest[3] += d;
}
-static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4])
+static void hmac_md5_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
md5_transform (w0, w1, w2, w3, opad);
}
-static void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4])
+static void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1];
w0_t[2] = salt_buf0[2];
w0_t[3] = salt_buf0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = salt_buf1[0];
w1_t[1] = salt_buf1[1];
w1_t[2] = salt_buf1[2];
w1_t[3] = salt_buf1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[4];
- u32 opad[4];
+ u32x ipad[4];
+ u32x opad[4];
hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0rl = w0r | w0l;
- append_0x80_4x4 (w0, w1, w2, w3, pw_len);
-
- w0_t[0] = w0[0];
+ w0_t[0] = w0rl;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = (64 + pw_len) * 8;
w3_t[3] = 0;
- u32 digest[4];
+ append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_len);
- hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
+ u32x digest[4];
- const u32 r0 = digest[0];
- const u32 r1 = digest[3];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
+ hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]);
}
}
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1];
w0_t[2] = salt_buf0[2];
w0_t[3] = salt_buf0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = salt_buf1[0];
w1_t[1] = salt_buf1[1];
w1_t[2] = salt_buf1[2];
w1_t[3] = salt_buf1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[4];
- u32 opad[4];
+ u32x ipad[4];
+ u32x opad[4];
hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- append_0x80_4x4 (w0, w1, w2, w3, pw_len);
+ const u32x w0rl = w0r | w0l;
- w0_t[0] = w0[0];
+ w0_t[0] = w0rl;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = (64 + pw_len) * 8;
w3_t[3] = 0;
- u32 digest[4];
+ append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_len);
- hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
+ u32x digest[4];
- const u32 r0 = digest[0];
- const u32 r1 = digest[3];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
+ hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m00100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22U);
SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21));
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s));
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22));
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
-static void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u) - SHA1C03;
+ const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
/**
* loop
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
- bool q_cond = allx (e_rev != e);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22U);
SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21));
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s));
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22));
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
-__kernel void m00100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 pw_salt_len = out_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 pw_salt_len = out_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
-
- w[ 0] |= swap32 (salt_buf0[0]);
- w[ 1] |= swap32 (salt_buf0[1]);
- w[ 2] |= swap32 (salt_buf0[2]);
- w[ 3] |= swap32 (salt_buf0[3]);
- w[ 4] |= swap32 (salt_buf1[0]);
- w[ 5] |= swap32 (salt_buf1[1]);
- w[ 6] |= swap32 (salt_buf1[2]);
- w[ 7] |= swap32 (salt_buf1[3]);
- w[ 8] |= swap32 (salt_buf2[0]);
- w[ 9] |= swap32 (salt_buf2[1]);
- w[10] |= swap32 (salt_buf2[2]);
- w[11] |= swap32 (salt_buf2[3]);
- w[12] |= swap32 (salt_buf3[0]);
- w[13] |= swap32 (salt_buf3[1]);
- w[14] |= swap32 (salt_buf3[2]);
- w[15] |= swap32 (salt_buf3[3]);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+
+ w[ 0] |= swap32_S (salt_buf0[0]);
+ w[ 1] |= swap32_S (salt_buf0[1]);
+ w[ 2] |= swap32_S (salt_buf0[2]);
+ w[ 3] |= swap32_S (salt_buf0[3]);
+ w[ 4] |= swap32_S (salt_buf1[0]);
+ w[ 5] |= swap32_S (salt_buf1[1]);
+ w[ 6] |= swap32_S (salt_buf1[2]);
+ w[ 7] |= swap32_S (salt_buf1[3]);
+ w[ 8] |= swap32_S (salt_buf2[0]);
+ w[ 9] |= swap32_S (salt_buf2[1]);
+ w[10] |= swap32_S (salt_buf2[2]);
+ w[11] |= swap32_S (salt_buf2[3]);
+ w[12] |= swap32_S (salt_buf3[0]);
+ w[13] |= swap32_S (salt_buf3[1]);
+ w[14] |= swap32_S (salt_buf3[2]);
+ w[15] |= swap32_S (salt_buf3[3]);
const u32 salt_len = salt_bufs[salt_pos].salt_len;
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_76s = rotl32_S ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32 c_77s = rotl32_S ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32 c_78s = rotl32_S ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32 c_79s = rotl32_S ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22U);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22));
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
-static void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u) - SHA1C03;
+ const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
/**
* loop
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
- bool q_cond = allx (e_rev != e);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22u);
SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21));
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s));
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22));
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
-__kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00110m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00110_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00110_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00110m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00110_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00110_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00110m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00110s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00110_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00110_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00110s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00110_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00110_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void overwrite_at (u32 sw[16], const u32 w0, const u32 salt_len)
-{
- switch (salt_len)
- {
- case 0: sw[0] = w0;
- break;
- case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8);
- sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
- break;
- case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16);
- sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
- break;
- case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24);
- sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
- break;
- case 4: sw[1] = w0;
- break;
- case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
- sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
- break;
- case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
- sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
- break;
- case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
- sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
- break;
- case 8: sw[2] = w0;
- break;
- case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
- sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
- break;
- case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
- sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
- break;
- case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
- sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
- break;
- case 12: sw[3] = w0;
- break;
- case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
- sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
- break;
- case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
- sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
- break;
- case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
- sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
- break;
- case 16: sw[4] = w0;
- break;
- case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
- sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
- break;
- case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
- sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
- break;
- case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
- sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
- break;
- case 20: sw[5] = w0;
- break;
- case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
- sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
- break;
- case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
- sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
- break;
- case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
- sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
- break;
- case 24: sw[6] = w0;
- break;
- case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
- sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
- break;
- case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
- sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
- break;
- case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
- sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
- break;
- case 28: sw[7] = w0;
- break;
- case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
- sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24);
- break;
- case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
- sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16);
- break;
- case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
- sw[8] = (sw[8] & 0x000000ff) | (w0 << 8);
- break;
- }
-}
+#include "OpenCL/simd.c"
static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
u32 w2_t[4];
u32 w3_t[4];
- w0_t[0] = swap32 (w0[0]);
- w0_t[1] = swap32 (w0[1]);
- w0_t[2] = swap32 (w0[2]);
- w0_t[3] = swap32 (w0[3]);
- w1_t[0] = swap32 (w1[0]);
- w1_t[1] = swap32 (w1[1]);
- w1_t[2] = swap32 (w1[2]);
- w1_t[3] = swap32 (w1[3]);
- w2_t[0] = swap32 (w2[0]);
- w2_t[1] = swap32 (w2[1]);
- w2_t[2] = swap32 (w2[2]);
- w2_t[3] = swap32 (w2[3]);
- w3_t[0] = swap32 (w3[0]);
- w3_t[1] = swap32 (w3[1]);
- w3_t[2] = swap32 (w3[2]);
- w3_t[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] |= salt_buf3[2];
w3_t[3] |= salt_buf3[3];
- w0_t[0] = swap32 (w0_t[0]);
- w0_t[1] = swap32 (w0_t[1]);
- w0_t[2] = swap32 (w0_t[2]);
- w0_t[3] = swap32 (w0_t[3]);
- w1_t[0] = swap32 (w1_t[0]);
- w1_t[1] = swap32 (w1_t[1]);
- w1_t[2] = swap32 (w1_t[2]);
- w1_t[3] = swap32 (w1_t[3]);
- w2_t[0] = swap32 (w2_t[0]);
- w2_t[1] = swap32 (w2_t[1]);
- w2_t[2] = swap32 (w2_t[2]);
- w2_t[3] = swap32 (w2_t[3]);
- w3_t[0] = swap32 (w3_t[0]);
- w3_t[1] = swap32 (w3_t[1]);
- w3_t[2] = swap32 (w3_t[2]);
- w3_t[3] = swap32 (w3_t[3]);
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- const u32 w0n = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 wx[16];
+ u32x wx[16];
wx[ 0] = w0_t[0];
wx[ 1] = w0_t[1];
wx[14] = w3_t[2];
wx[15] = w3_t[3];
- overwrite_at (wx, w0n, salt_len);
+ overwrite_at_be (wx, w0lr, salt_len);
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = wx[ 0];
w0_t[1] = wx[ 1];
* sha1
*/
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u);
+ const u32 e_rev = rotl32_S (search[1], 2u);
/**
* salt
u32 w2_t[4];
u32 w3_t[4];
- w0_t[0] = swap32 (w0[0]);
- w0_t[1] = swap32 (w0[1]);
- w0_t[2] = swap32 (w0[2]);
- w0_t[3] = swap32 (w0[3]);
- w1_t[0] = swap32 (w1[0]);
- w1_t[1] = swap32 (w1[1]);
- w1_t[2] = swap32 (w1[2]);
- w1_t[3] = swap32 (w1[3]);
- w2_t[0] = swap32 (w2[0]);
- w2_t[1] = swap32 (w2[1]);
- w2_t[2] = swap32 (w2[2]);
- w2_t[3] = swap32 (w2[3]);
- w3_t[0] = swap32 (w3[0]);
- w3_t[1] = swap32 (w3[1]);
- w3_t[2] = swap32 (w3[2]);
- w3_t[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] |= salt_buf3[2];
w3_t[3] |= salt_buf3[3];
- w0_t[0] = swap32 (w0_t[0]);
- w0_t[1] = swap32 (w0_t[1]);
- w0_t[2] = swap32 (w0_t[2]);
- w0_t[3] = swap32 (w0_t[3]);
- w1_t[0] = swap32 (w1_t[0]);
- w1_t[1] = swap32 (w1_t[1]);
- w1_t[2] = swap32 (w1_t[2]);
- w1_t[3] = swap32 (w1_t[3]);
- w2_t[0] = swap32 (w2_t[0]);
- w2_t[1] = swap32 (w2_t[1]);
- w2_t[2] = swap32 (w2_t[2]);
- w2_t[3] = swap32 (w2_t[3]);
- w3_t[0] = swap32 (w3_t[0]);
- w3_t[1] = swap32 (w3_t[1]);
- w3_t[2] = swap32 (w3_t[2]);
- w3_t[3] = swap32 (w3_t[3]);
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- const u32 w0n = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 wx[16];
+ u32x wx[16];
wx[ 0] = w0_t[0];
wx[ 1] = w0_t[1];
wx[14] = w3_t[2];
wx[15] = w3_t[3];
- overwrite_at (wx, w0n, salt_len);
+ overwrite_at_be (wx, w0lr, salt_len);
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = wx[ 0];
w0_t[1] = wx[ 1];
* sha1
*/
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
- if (allx (e != e_rev)) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2));
const u32 out_salt_len = (out_len * 2) + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2));
const u32 out_salt_len = (out_len * 2) + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2));
const u32 pw_salt_len = (pw_len * 2) + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2));
const u32 pw_salt_len = (pw_len * 2) + salt_len;
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
-
- w[ 0] |= swap32 (salt_buf0[0]);
- w[ 1] |= swap32 (salt_buf0[1]);
- w[ 2] |= swap32 (salt_buf0[2]);
- w[ 3] |= swap32 (salt_buf0[3]);
- w[ 4] |= swap32 (salt_buf1[0]);
- w[ 5] |= swap32 (salt_buf1[1]);
- w[ 6] |= swap32 (salt_buf1[2]);
- w[ 7] |= swap32 (salt_buf1[3]);
- w[ 8] |= swap32 (salt_buf2[0]);
- w[ 9] |= swap32 (salt_buf2[1]);
- w[10] |= swap32 (salt_buf2[2]);
- w[11] |= swap32 (salt_buf2[3]);
- w[12] |= swap32 (salt_buf3[0]);
- w[13] |= swap32 (salt_buf3[1]);
- w[14] |= swap32 (salt_buf3[2]);
- w[15] |= swap32 (salt_buf3[3]);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+
+ w[ 0] |= swap32_S (salt_buf0[0]);
+ w[ 1] |= swap32_S (salt_buf0[1]);
+ w[ 2] |= swap32_S (salt_buf0[2]);
+ w[ 3] |= swap32_S (salt_buf0[3]);
+ w[ 4] |= swap32_S (salt_buf1[0]);
+ w[ 5] |= swap32_S (salt_buf1[1]);
+ w[ 6] |= swap32_S (salt_buf1[2]);
+ w[ 7] |= swap32_S (salt_buf1[3]);
+ w[ 8] |= swap32_S (salt_buf2[0]);
+ w[ 9] |= swap32_S (salt_buf2[1]);
+ w[10] |= swap32_S (salt_buf2[2]);
+ w[11] |= swap32_S (salt_buf2[3]);
+ w[12] |= swap32_S (salt_buf3[0]);
+ w[13] |= swap32_S (salt_buf3[1]);
+ w[14] |= swap32_S (salt_buf3[2]);
+ w[15] |= swap32_S (salt_buf3[3]);
const u32 salt_len = salt_bufs[salt_pos].salt_len;
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_76s = rotl32_S ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32 c_77s = rotl32_S ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32 c_78s = rotl32_S ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32 c_79s = rotl32_S ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
const u32 c_65sK = c_65s + SHA1C03;
const u32 c_69sK = c_69s + SHA1C03;
- /**
+ /**
* loop
*/
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22U);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22));
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
-static void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u) - SHA1C03;
+ const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
/**
* loop
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
- bool q_cond = allx (e_rev != e);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32 c_76s = rotl32_S ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32 c_77s = rotl32_S ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32 c_78s = rotl32_S ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32 c_79s = rotl32_S ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22u);
SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21));
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s));
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22));
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
-__kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00130m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00130_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00130_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00130m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00130_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00130_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00130m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00130s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00130_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00130_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00130s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00130_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00130_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void overwrite_at (u32 sw[16], const u32 w0, const u32 salt_len)
-{
- switch (salt_len)
- {
- case 0: sw[0] = w0;
- break;
- case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8);
- sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
- break;
- case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16);
- sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
- break;
- case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24);
- sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
- break;
- case 4: sw[1] = w0;
- break;
- case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
- sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
- break;
- case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
- sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
- break;
- case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
- sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
- break;
- case 8: sw[2] = w0;
- break;
- case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
- sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
- break;
- case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
- sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
- break;
- case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
- sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
- break;
- case 12: sw[3] = w0;
- break;
- case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
- sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
- break;
- case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
- sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
- break;
- case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
- sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
- break;
- case 16: sw[4] = w0;
- break;
- case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
- sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
- break;
- case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
- sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
- break;
- case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
- sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
- break;
- case 20: sw[5] = w0;
- break;
- case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
- sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
- break;
- case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
- sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
- break;
- case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
- sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
- break;
- case 24: sw[6] = w0;
- break;
- case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
- sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
- break;
- case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
- sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
- break;
- case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
- sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
- break;
- case 28: sw[7] = w0;
- break;
- case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
- sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24);
- break;
- case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
- sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16);
- break;
- case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
- sw[8] = (sw[8] & 0x000000ff) | (w0 << 8);
- break;
- }
-}
+#include "OpenCL/simd.c"
static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
u32 w2_t[4];
u32 w3_t[4];
- w0_t[0] = swap32 (w0[0]);
- w0_t[1] = swap32 (w0[1]);
- w0_t[2] = swap32 (w0[2]);
- w0_t[3] = swap32 (w0[3]);
- w1_t[0] = swap32 (w1[0]);
- w1_t[1] = swap32 (w1[1]);
- w1_t[2] = swap32 (w1[2]);
- w1_t[3] = swap32 (w1[3]);
- w2_t[0] = swap32 (w2[0]);
- w2_t[1] = swap32 (w2[1]);
- w2_t[2] = swap32 (w2[2]);
- w2_t[3] = swap32 (w2[3]);
- w3_t[0] = swap32 (w3[0]);
- w3_t[1] = swap32 (w3[1]);
- w3_t[2] = swap32 (w3[2]);
- w3_t[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] |= salt_buf3[2];
w3_t[3] |= salt_buf3[3];
- w0_t[0] = swap32 (w0_t[0]);
- w0_t[1] = swap32 (w0_t[1]);
- w0_t[2] = swap32 (w0_t[2]);
- w0_t[3] = swap32 (w0_t[3]);
- w1_t[0] = swap32 (w1_t[0]);
- w1_t[1] = swap32 (w1_t[1]);
- w1_t[2] = swap32 (w1_t[2]);
- w1_t[3] = swap32 (w1_t[3]);
- w2_t[0] = swap32 (w2_t[0]);
- w2_t[1] = swap32 (w2_t[1]);
- w2_t[2] = swap32 (w2_t[2]);
- w2_t[3] = swap32 (w2_t[3]);
- w3_t[0] = swap32 (w3_t[0]);
- w3_t[1] = swap32 (w3_t[1]);
- w3_t[2] = swap32 (w3_t[2]);
- w3_t[3] = swap32 (w3_t[3]);
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- const u32 w0n = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 wx[16];
+ u32x wx[16];
wx[ 0] = w0_t[0];
wx[ 1] = w0_t[1];
wx[14] = w3_t[2];
wx[15] = w3_t[3];
- overwrite_at (wx, w0n, salt_len);
+ overwrite_at_be (wx, w0lr, salt_len);
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = wx[ 0];
w0_t[1] = wx[ 1];
* sha1
*/
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u);
+ const u32 e_rev = rotl32_S (search[1], 2u);
/**
* salt
u32 w2_t[4];
u32 w3_t[4];
- w0_t[0] = swap32 (w0[0]);
- w0_t[1] = swap32 (w0[1]);
- w0_t[2] = swap32 (w0[2]);
- w0_t[3] = swap32 (w0[3]);
- w1_t[0] = swap32 (w1[0]);
- w1_t[1] = swap32 (w1[1]);
- w1_t[2] = swap32 (w1[2]);
- w1_t[3] = swap32 (w1[3]);
- w2_t[0] = swap32 (w2[0]);
- w2_t[1] = swap32 (w2[1]);
- w2_t[2] = swap32 (w2[2]);
- w2_t[3] = swap32 (w2[3]);
- w3_t[0] = swap32 (w3[0]);
- w3_t[1] = swap32 (w3[1]);
- w3_t[2] = swap32 (w3[2]);
- w3_t[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] |= salt_buf3[2];
w3_t[3] |= salt_buf3[3];
- w0_t[0] = swap32 (w0_t[0]);
- w0_t[1] = swap32 (w0_t[1]);
- w0_t[2] = swap32 (w0_t[2]);
- w0_t[3] = swap32 (w0_t[3]);
- w1_t[0] = swap32 (w1_t[0]);
- w1_t[1] = swap32 (w1_t[1]);
- w1_t[2] = swap32 (w1_t[2]);
- w1_t[3] = swap32 (w1_t[3]);
- w2_t[0] = swap32 (w2_t[0]);
- w2_t[1] = swap32 (w2_t[1]);
- w2_t[2] = swap32 (w2_t[2]);
- w2_t[3] = swap32 (w2_t[3]);
- w3_t[0] = swap32 (w3_t[0]);
- w3_t[1] = swap32 (w3_t[1]);
- w3_t[2] = swap32 (w3_t[2]);
- w3_t[3] = swap32 (w3_t[3]);
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- const u32 w0n = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 wx[16];
+ u32x wx[16];
wx[ 0] = w0_t[0];
wx[ 1] = w0_t[1];
wx[14] = w3_t[2];
wx[15] = w3_t[3];
- overwrite_at (wx, w0n, salt_len);
+ overwrite_at_be (wx, w0lr, salt_len);
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = wx[ 0];
w0_t[1] = wx[ 1];
* sha1
*/
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
- if (allx (e != e_rev)) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
digest[4] += E;
}
-static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5])
+static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
sha1_transform (w0, w1, w2, w3, opad);
}
-static void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5])
+static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[5];
- u32 opad[5];
+ u32x ipad[5];
+ u32x opad[5];
hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (64 + salt_len) * 8;
- u32 digest[5];
+ u32x digest[5];
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[5];
- u32 opad[5];
+ u32x ipad[5];
+ u32x opad[5];
hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (64 + salt_len) * 8;
- u32 digest[5];
+ u32x digest[5];
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
digest[4] += E;
}
-static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5])
+static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
sha1_transform (w0, w1, w2, w3, opad);
}
-static void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5])
+static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = swap32 (salt_buf0[0]);
w0_t[1] = swap32 (salt_buf0[1]);
w0_t[2] = swap32 (salt_buf0[2]);
w0_t[3] = swap32 (salt_buf0[3]);
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = swap32 (salt_buf1[0]);
w1_t[1] = swap32 (salt_buf1[1]);
w1_t[2] = swap32 (salt_buf1[2]);
w1_t[3] = swap32 (salt_buf1[3]);
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[5];
- u32 opad[5];
+ u32x ipad[5];
+ u32x opad[5];
hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = (64 + pw_len) * 8;
- u32 digest[5];
+ u32x digest[5];
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = swap32 (salt_buf0[0]);
w0_t[1] = swap32 (salt_buf0[1]);
w0_t[2] = swap32 (salt_buf0[2]);
w0_t[3] = swap32 (salt_buf0[3]);
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = swap32 (salt_buf1[0]);
w1_t[1] = swap32 (salt_buf1[1]);
w1_t[2] = swap32 (salt_buf1[2]);
w1_t[3] = swap32 (salt_buf1[3]);
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[5];
- u32 opad[5];
+ u32x ipad[5];
+ u32x opad[5];
hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = (64 + pw_len) * 8;
- u32 digest[5];
+ u32x digest[5];
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m00190m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00190m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22U);
SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21));
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s));
d += SHA1M_D;
c += SHA1M_C;
- {
- const u32 r0 = a;
- const u32 r1 = e;
- const u32 r2 = d;
- const u32 r3 = c;
-
- #include COMPARE_M
- }
+ COMPARE_M_SIMD (a, e, d, c);
a &= 0x00000fff;
- {
- const u32 r0 = a;
- const u32 r1 = e;
- const u32 r2 = d;
- const u32 r3 = c;
-
- #include COMPARE_M
- }
+ COMPARE_M_SIMD (a, e, d, c);
}
}
-static void m00190s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00190s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
+ SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
- SHA1_STEP (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
-
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22U);
SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21));
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s));
d += SHA1M_D;
c += SHA1M_C;
- {
- const u32 r0 = a;
- const u32 r1 = e;
- const u32 r2 = d;
- const u32 r3 = c;
-
- #include COMPARE_S
- }
+ COMPARE_S_SIMD (a, e, d, c);
a &= 0x00000fff;
- {
- const u32 r0 = a;
- const u32 r1 = e;
- const u32 r2 = d;
- const u32 r3 = c;
-
- #include COMPARE_S
- }
+ COMPARE_S_SIMD (a, e, d, c);
}
}
-__kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00190m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00190_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00190_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00190m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00190_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00190_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00190m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00190s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00190_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00190_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00190s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00190_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00190_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w_t[16];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w_t[16];
#define _MYSQL323_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m00200m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00200m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MYSQL323_A;
- u32 b = MYSQL323_B;
+ u32x a = MYSQL323_A;
+ u32x b = MYSQL323_B;
+ u32x c = 0;
+ u32x d = 0;
- u32 add = 7;
+ u32x add = 7;
#define ROUND(v) \
{ \
for (i = 4, j = 1; i <= (int) pw_len - 4; i += 4, j += 1)
{
- const u32 wj = w[j];
+ const u32x wj = w[j];
ROUND ((wj >> 0) & 0xff);
ROUND ((wj >> 8) & 0xff);
ROUND ((wj >> 24) & 0xff);
}
- const u32 wj = w[j];
+ const u32x wj = w[j];
const u32 left = pw_len - i;
a &= 0x7fffffff;
b &= 0x7fffffff;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, b, c, d);
}
}
-static void m00200s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00200s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MYSQL323_A;
- u32 b = MYSQL323_B;
+ u32x a = MYSQL323_A;
+ u32x b = MYSQL323_B;
+ u32x c = 0;
+ u32x d = 0;
- u32 add = 7;
+ u32x add = 7;
#define ROUND(v) \
{ \
for (i = 4, j = 1; i <= (int) pw_len - 4; i += 4, j += 1)
{
- const u32 wj = w[j];
+ const u32x wj = w[j];
ROUND ((wj >> 0) & 0xff);
ROUND ((wj >> 8) & 0xff);
ROUND ((wj >> 24) & 0xff);
}
- const u32 wj = w[j];
+ const u32x wj = w[j];
const u32 left = pw_len - i;
a &= 0x7fffffff;
b &= 0x7fffffff;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, b, c, d);
}
}
-__kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00200m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00200_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00200_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00200m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00200_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00200_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00200m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00200s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00200_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00200_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00200s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00200_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00200_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m00300m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00300m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_76s = rotl32_S ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32 c_77s = rotl32_S ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32 c_78s = rotl32_S ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32 c_79s = rotl32_S ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22U);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
d += SHA1M_D;
e += SHA1M_E;
- u32 w0_t = a;
- u32 w1_t = b;
- u32 w2_t = c;
- u32 w3_t = d;
- u32 w4_t = e;
- u32 w5_t = 0x80000000;
- u32 w6_t = 0;
- u32 w7_t = 0;
- u32 w8_t = 0;
- u32 w9_t = 0;
- u32 wa_t = 0;
- u32 wb_t = 0;
- u32 wc_t = 0;
- u32 wd_t = 0;
- u32 we_t = 0;
- u32 wf_t = 20 * 8;
+ u32x w0_t = a;
+ u32x w1_t = b;
+ u32x w2_t = c;
+ u32x w3_t = d;
+ u32x w4_t = e;
+ u32x w5_t = 0x80000000;
+ u32x w6_t = 0;
+ u32x w7_t = 0;
+ u32x w8_t = 0;
+ u32x w9_t = 0;
+ u32x wa_t = 0;
+ u32x wb_t = 0;
+ u32x wc_t = 0;
+ u32x wd_t = 0;
+ u32x we_t = 0;
+ u32x wf_t = 20 * 8;
a = SHA1M_A;
b = SHA1M_B;
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
-static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
* base
*/
- const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
- const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
- const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
- const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
- const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
- const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
- const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
- const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
- const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
- const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
- const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
- const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
- const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
- const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
- const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
- const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
- const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
- const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
- const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
- const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
- const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
- const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
- const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
- const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
- const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
- const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
- const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
- const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
- const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
- const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
- const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
- const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
- const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
- const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
- const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
- const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
- const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
- const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
- const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
- const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
- const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
- const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
- const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
- const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
- const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
- const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
- const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
- const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
- const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
- const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
- const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
- const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
- const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
- const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
- const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
- const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
- const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
- const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
- const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
- const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
+ const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u);
+ const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u);
+ const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u);
+ const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u);
+ const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u);
+ const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u);
+ const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u);
+ const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u);
+ const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u);
+ const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u);
+ const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u);
+ const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u);
+ const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u);
+ const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u);
+ const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u);
+ const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u);
+ const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u);
+ const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u);
+ const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u);
+ const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u);
+ const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u);
+ const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u);
+ const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u);
+ const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u);
+ const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u);
+ const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u);
+ const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u);
+ const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u);
+ const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u);
+ const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u);
+ const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u);
+ const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u);
+ const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u);
+ const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u);
+ const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u);
+ const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u);
+ const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u);
+ const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u);
+ const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u);
+ const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u);
+ const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u);
+ const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u);
+ const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u);
+ const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u);
+ const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u);
+ const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u);
+ const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u);
+ const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u);
+ const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u);
+ const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u);
+ const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u);
+ const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u);
+ const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u);
+ const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u);
+ const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u);
+ const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u);
+ const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u);
+ const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u);
+ const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u);
+ const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u);
const u32 c_17sK = c_17s + SHA1C00;
const u32 c_18sK = c_18s + SHA1C00;
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u);
+ const u32 e_rev = rotl32_S (search[1], 2u);
/**
* loop
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- const u32 w0s01 = rotl32 (w0, 1u);
- const u32 w0s02 = rotl32 (w0, 2u);
- const u32 w0s03 = rotl32 (w0, 3u);
- const u32 w0s04 = rotl32 (w0, 4u);
- const u32 w0s05 = rotl32 (w0, 5u);
- const u32 w0s06 = rotl32 (w0, 6u);
- const u32 w0s07 = rotl32 (w0, 7u);
- const u32 w0s08 = rotl32 (w0, 8u);
- const u32 w0s09 = rotl32 (w0, 9u);
- const u32 w0s10 = rotl32 (w0, 10u);
- const u32 w0s11 = rotl32 (w0, 11u);
- const u32 w0s12 = rotl32 (w0, 12u);
- const u32 w0s13 = rotl32 (w0, 13u);
- const u32 w0s14 = rotl32 (w0, 14u);
- const u32 w0s15 = rotl32 (w0, 15u);
- const u32 w0s16 = rotl32 (w0, 16u);
- const u32 w0s17 = rotl32 (w0, 17u);
- const u32 w0s18 = rotl32 (w0, 18u);
- const u32 w0s19 = rotl32 (w0, 19u);
- const u32 w0s20 = rotl32 (w0, 20u);
-
- const u32 w0s04___w0s06 = w0s04 ^ w0s06;
- const u32 w0s04___w0s08 = w0s04 ^ w0s08;
- const u32 w0s08___w0s12 = w0s08 ^ w0s12;
- const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ const u32x w0s01 = rotl32 (w0, 1u);
+ const u32x w0s02 = rotl32 (w0, 2u);
+ const u32x w0s03 = rotl32 (w0, 3u);
+ const u32x w0s04 = rotl32 (w0, 4u);
+ const u32x w0s05 = rotl32 (w0, 5u);
+ const u32x w0s06 = rotl32 (w0, 6u);
+ const u32x w0s07 = rotl32 (w0, 7u);
+ const u32x w0s08 = rotl32 (w0, 8u);
+ const u32x w0s09 = rotl32 (w0, 9u);
+ const u32x w0s10 = rotl32 (w0, 10u);
+ const u32x w0s11 = rotl32 (w0, 11u);
+ const u32x w0s12 = rotl32 (w0, 12u);
+ const u32x w0s13 = rotl32 (w0, 13u);
+ const u32x w0s14 = rotl32 (w0, 14u);
+ const u32x w0s15 = rotl32 (w0, 15u);
+ const u32x w0s16 = rotl32 (w0, 16u);
+ const u32x w0s17 = rotl32 (w0, 17u);
+ const u32x w0s18 = rotl32 (w0, 18u);
+ const u32x w0s19 = rotl32 (w0, 19u);
+ const u32x w0s20 = rotl32 (w0, 20u);
+
+ const u32x w0s04___w0s06 = w0s04 ^ w0s06;
+ const u32x w0s04___w0s08 = w0s04 ^ w0s08;
+ const u32x w0s08___w0s12 = w0s08 ^ w0s12;
+ const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
- const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
- const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
- const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
- const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
+ const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
+ const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
+ const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
+ const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u);
- const u32 w0s21 = rotl32 (w0, 21u);
- const u32 w0s22 = rotl32 (w0, 22U);
+ const u32x w0s21 = rotl32 (w0, 21u);
+ const u32x w0s22 = rotl32 (w0, 22U);
SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21));
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s));
d += SHA1M_D;
e += SHA1M_E;
- u32 w0_t = a;
- u32 w1_t = b;
- u32 w2_t = c;
- u32 w3_t = d;
- u32 w4_t = e;
- u32 w5_t = 0x80000000;
- u32 w6_t = 0;
- u32 w7_t = 0;
- u32 w8_t = 0;
- u32 w9_t = 0;
- u32 wa_t = 0;
- u32 wb_t = 0;
- u32 wc_t = 0;
- u32 wd_t = 0;
- u32 we_t = 0;
- u32 wf_t = 20 * 8;
+ u32x w0_t = a;
+ u32x w1_t = b;
+ u32x w2_t = c;
+ u32x w3_t = d;
+ u32x w4_t = e;
+ u32x w5_t = 0x80000000;
+ u32x w6_t = 0;
+ u32x w7_t = 0;
+ u32x w8_t = 0;
+ u32x w9_t = 0;
+ u32x wa_t = 0;
+ u32x wb_t = 0;
+ u32x wc_t = 0;
+ u32x wd_t = 0;
+ u32x we_t = 0;
+ u32x wf_t = 20 * 8;
a = SHA1M_A;
b = SHA1M_B;
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
- bool q_cond = allx (e_rev != e);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
-__kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00300m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00300_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00300_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00300m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00300_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00300_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00300m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00300s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00300_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00300_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00300s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00300_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00300_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD4_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
#define MD4_STEP_REV(f,a,b,c,d,x,t,s) \
{ \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= f (b, c, d); \
a -= x; \
a -= t; \
#define MD4_STEP_REV1(f,a,b,c,d,x,t,s) \
{ \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= x; \
a -= t; \
}
-static void m00900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MD4M_A;
- u32 b = MD4M_B;
- u32 c = MD4M_C;
- u32 d = MD4M_D;
+ u32x a = MD4M_A;
+ u32x b = MD4M_B;
+ u32x c = MD4M_C;
+ u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00);
MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01);
MD4_STEP0(MD4_H , c, d, a, b, H_w7c02, MD4S22);
MD4_STEP0(MD4_H , b, c, d, a, H_wfc02, MD4S23);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
-static void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 c_rev = digests_buf[digests_offset].digest_buf[2];
u32 d_rev = digests_buf[digests_offset].digest_buf[3];
- MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23);
- MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22);
- MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[11], MD4C02, MD4S21);
- MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 3], MD4C02, MD4S20);
- MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[13], MD4C02, MD4S23);
- MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 5], MD4C02, MD4S22);
- MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 9], MD4C02, MD4S21);
- MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 1], MD4C02, MD4S20);
- MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[14], MD4C02, MD4S23);
- MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 6], MD4C02, MD4S22);
- MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[10], MD4C02, MD4S21);
- MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 2], MD4C02, MD4S20);
- MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[12], MD4C02, MD4S23);
- MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 4], MD4C02, MD4S22);
- MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21);
- MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20);
+ MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23);
+ MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22);
+ MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[11], MD4C02, MD4S21);
+ MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 3], MD4C02, MD4S20);
+ MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[13], MD4C02, MD4S23);
+ MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 5], MD4C02, MD4S22);
+ MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[ 9], MD4C02, MD4S21);
+ MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 1], MD4C02, MD4S20);
+ MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[14], MD4C02, MD4S23);
+ MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD4C02, MD4S22);
+ MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[10], MD4C02, MD4S21);
+ MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 2], MD4C02, MD4S20);
+ MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[12], MD4C02, MD4S23);
+ MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 4], MD4C02, MD4S22);
+ MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21);
+ MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20);
const u32 sav_c = c_rev;
const u32 sav_d = d_rev;
- MD4_STEP_REV1(MD4_G, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13);
- MD4_STEP_REV1(MD4_G, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12);
+ MD4_STEP_REV1(MD4_G_S, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13);
+ MD4_STEP_REV1(MD4_G_S, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12);
/**
* loop
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 pre_a = a_rev;
- u32 pre_b = b_rev;
- u32 pre_c = c_rev;
+ u32x pre_a = a_rev;
+ u32x pre_b = b_rev;
+ u32x pre_c = c_rev;
pre_a = pre_a - w0;
pre_b = pre_b - MD4_G (sav_c, sav_d, pre_a);
pre_c = pre_c - MD4_G (sav_d, pre_a, pre_b);
- u32 a = MD4M_A;
- u32 b = MD4M_B;
- u32 c = MD4M_C;
- u32 d = MD4M_D;
+ u32x a = MD4M_A;
+ u32x b = MD4M_B;
+ u32x c = MD4M_C;
+ u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00);
MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01);
MD4_STEP0(MD4_Go, d, a, b, c, G_w6c01, MD4S11);
MD4_STEP0(MD4_Go, c, d, a, b, G_wac01, MD4S12);
- bool q_cond = allx (pre_c != c);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VV (c, pre_c)) continue;
MD4_STEP0(MD4_Go, b, c, d, a, G_wec01, MD4S13);
MD4_STEP0(MD4_Go, a, b, c, d, G_w3c01, MD4S10);
- bool q_cond2 = allx (pre_a != a);
-
- if (q_cond2) continue;
+ if (MATCHES_NONE_VV (a, pre_a)) continue;
MD4_STEP0(MD4_Go, d, a, b, c, G_w7c01, MD4S11);
MD4_STEP0(MD4_Go, c, d, a, b, G_wbc01, MD4S12);
MD4_STEP0(MD4_H , c, d, a, b, H_w7c02, MD4S22);
MD4_STEP0(MD4_H , b, c, d, a, H_wfc02, MD4S23);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
-__kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00900s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m00900s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m00900_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m00900_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define MD4_STEP_REV(f,a,b,c,d,x,t,s) \
{ \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= f (b, c, d); \
a -= x; \
a -= t; \
#define MD4_STEP_REV1(f,a,b,c,d,x,t,s) \
{ \
- a = rotr32 (a, s); \
+ a = rotr32_S (a, s); \
a -= x; \
a -= t; \
}
const u32x w0 = w0l | w0r;
- u32x tmp2;
-
u32x a = MD4M_A;
u32x b = MD4M_B;
u32x c = MD4M_C;
MD4_STEP0(MD4_Go, c, d, a, b, G_wbc01, MD4S12);
MD4_STEP0(MD4_Go, b, c, d, a, G_wfc01, MD4S13);
- MD4_STEP (MD4_H1, a, b, c, d, w0, H_w0c02, MD4S20);
- MD4_STEP0(MD4_H2, d, a, b, c, H_w8c02, MD4S21);
- MD4_STEP0(MD4_H1, c, d, a, b, H_w4c02, MD4S22);
- MD4_STEP0(MD4_H2, b, c, d, a, H_wcc02, MD4S23);
- MD4_STEP0(MD4_H1, a, b, c, d, H_w2c02, MD4S20);
- MD4_STEP0(MD4_H2, d, a, b, c, H_wac02, MD4S21);
- MD4_STEP0(MD4_H1, c, d, a, b, H_w6c02, MD4S22);
- MD4_STEP0(MD4_H2, b, c, d, a, H_wec02, MD4S23);
- MD4_STEP0(MD4_H1, a, b, c, d, H_w1c02, MD4S20);
- MD4_STEP0(MD4_H2, d, a, b, c, H_w9c02, MD4S21);
- MD4_STEP0(MD4_H1, c, d, a, b, H_w5c02, MD4S22);
- MD4_STEP0(MD4_H2, b, c, d, a, H_wdc02, MD4S23);
- MD4_STEP0(MD4_H1, a, b, c, d, H_w3c02, MD4S20);
- MD4_STEP0(MD4_H2, d, a, b, c, H_wbc02, MD4S21);
- MD4_STEP0(MD4_H1, c, d, a, b, H_w7c02, MD4S22);
- MD4_STEP0(MD4_H2, b, c, d, a, H_wfc02, MD4S23);
+ MD4_STEP (MD4_H , a, b, c, d, w0, H_w0c02, MD4S20);
+ MD4_STEP0(MD4_H , d, a, b, c, H_w8c02, MD4S21);
+ MD4_STEP0(MD4_H , c, d, a, b, H_w4c02, MD4S22);
+ MD4_STEP0(MD4_H , b, c, d, a, H_wcc02, MD4S23);
+ MD4_STEP0(MD4_H , a, b, c, d, H_w2c02, MD4S20);
+ MD4_STEP0(MD4_H , d, a, b, c, H_wac02, MD4S21);
+ MD4_STEP0(MD4_H , c, d, a, b, H_w6c02, MD4S22);
+ MD4_STEP0(MD4_H , b, c, d, a, H_wec02, MD4S23);
+ MD4_STEP0(MD4_H , a, b, c, d, H_w1c02, MD4S20);
+ MD4_STEP0(MD4_H , d, a, b, c, H_w9c02, MD4S21);
+ MD4_STEP0(MD4_H , c, d, a, b, H_w5c02, MD4S22);
+ MD4_STEP0(MD4_H , b, c, d, a, H_wdc02, MD4S23);
+ MD4_STEP0(MD4_H , a, b, c, d, H_w3c02, MD4S20);
+ MD4_STEP0(MD4_H , d, a, b, c, H_wbc02, MD4S21);
+ MD4_STEP0(MD4_H , c, d, a, b, H_w7c02, MD4S22);
+ MD4_STEP0(MD4_H , b, c, d, a, H_wfc02, MD4S23);
COMPARE_M_SIMD (a, d, c, b);
}
* reverse
*/
- u32x a_rev = digests_buf[digests_offset].digest_buf[0];
- u32x b_rev = digests_buf[digests_offset].digest_buf[1];
- u32x c_rev = digests_buf[digests_offset].digest_buf[2];
- u32x d_rev = digests_buf[digests_offset].digest_buf[3];
-
- MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23);
- MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22);
- MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[11], MD4C02, MD4S21);
- MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 3], MD4C02, MD4S20);
- MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[13], MD4C02, MD4S23);
- MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 5], MD4C02, MD4S22);
- MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 9], MD4C02, MD4S21);
- MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 1], MD4C02, MD4S20);
- MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[14], MD4C02, MD4S23);
- MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 6], MD4C02, MD4S22);
- MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[10], MD4C02, MD4S21);
- MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 2], MD4C02, MD4S20);
- MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[12], MD4C02, MD4S23);
- MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 4], MD4C02, MD4S22);
- MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21);
- MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20);
-
- const u32x sav_c = c_rev;
- const u32x sav_d = d_rev;
-
- MD4_STEP_REV1(MD4_G, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13);
- MD4_STEP_REV1(MD4_G, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12);
+ u32 a_rev = digests_buf[digests_offset].digest_buf[0];
+ u32 b_rev = digests_buf[digests_offset].digest_buf[1];
+ u32 c_rev = digests_buf[digests_offset].digest_buf[2];
+ u32 d_rev = digests_buf[digests_offset].digest_buf[3];
+
+ MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23);
+ MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22);
+ MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[11], MD4C02, MD4S21);
+ MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 3], MD4C02, MD4S20);
+ MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[13], MD4C02, MD4S23);
+ MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 5], MD4C02, MD4S22);
+ MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[ 9], MD4C02, MD4S21);
+ MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 1], MD4C02, MD4S20);
+ MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[14], MD4C02, MD4S23);
+ MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD4C02, MD4S22);
+ MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[10], MD4C02, MD4S21);
+ MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 2], MD4C02, MD4S20);
+ MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[12], MD4C02, MD4S23);
+ MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 4], MD4C02, MD4S22);
+ MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21);
+ MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20);
+
+ const u32 sav_c = c_rev;
+ const u32 sav_d = d_rev;
+
+ MD4_STEP_REV1(MD4_G_S, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13);
+ MD4_STEP_REV1(MD4_G_S, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12);
/**
* loop
pre_b = pre_b - MD4_G (sav_c, sav_d, pre_a);
pre_c = pre_c - MD4_G (sav_d, pre_a, pre_b);
- u32x tmp2;
-
u32x a = MD4M_A;
u32x b = MD4M_B;
u32x c = MD4M_C;
MD4_STEP0(MD4_Go, d, a, b, c, G_w6c01, MD4S11);
MD4_STEP0(MD4_Go, c, d, a, b, G_wac01, MD4S12);
- if (MATCHES_NONE_VV (pre_c, c)) continue;
+ if (MATCHES_NONE_VV (c, pre_c)) continue;
MD4_STEP0(MD4_Go, b, c, d, a, G_wec01, MD4S13);
MD4_STEP0(MD4_Go, a, b, c, d, G_w3c01, MD4S10);
- if (MATCHES_NONE_VV (pre_a, a)) continue;
+ if (MATCHES_NONE_VV (a, pre_a)) continue;
MD4_STEP0(MD4_Go, d, a, b, c, G_w7c01, MD4S11);
MD4_STEP0(MD4_Go, c, d, a, b, G_wbc01, MD4S12);
MD4_STEP0(MD4_Go, b, c, d, a, G_wfc01, MD4S13);
- MD4_STEP (MD4_H1, a, b, c, d, w0, H_w0c02, MD4S20);
- MD4_STEP0(MD4_H2, d, a, b, c, H_w8c02, MD4S21);
- MD4_STEP0(MD4_H1, c, d, a, b, H_w4c02, MD4S22);
- MD4_STEP0(MD4_H2, b, c, d, a, H_wcc02, MD4S23);
- MD4_STEP0(MD4_H1, a, b, c, d, H_w2c02, MD4S20);
- MD4_STEP0(MD4_H2, d, a, b, c, H_wac02, MD4S21);
- MD4_STEP0(MD4_H1, c, d, a, b, H_w6c02, MD4S22);
- MD4_STEP0(MD4_H2, b, c, d, a, H_wec02, MD4S23);
- MD4_STEP0(MD4_H1, a, b, c, d, H_w1c02, MD4S20);
- MD4_STEP0(MD4_H2, d, a, b, c, H_w9c02, MD4S21);
- MD4_STEP0(MD4_H1, c, d, a, b, H_w5c02, MD4S22);
- MD4_STEP0(MD4_H2, b, c, d, a, H_wdc02, MD4S23);
- MD4_STEP0(MD4_H1, a, b, c, d, H_w3c02, MD4S20);
- MD4_STEP0(MD4_H2, d, a, b, c, H_wbc02, MD4S21);
- MD4_STEP0(MD4_H1, c, d, a, b, H_w7c02, MD4S22);
- MD4_STEP0(MD4_H2, b, c, d, a, H_wfc02, MD4S23);
+ MD4_STEP (MD4_H , a, b, c, d, w0, H_w0c02, MD4S20);
+ MD4_STEP0(MD4_H , d, a, b, c, H_w8c02, MD4S21);
+ MD4_STEP0(MD4_H , c, d, a, b, H_w4c02, MD4S22);
+ MD4_STEP0(MD4_H , b, c, d, a, H_wcc02, MD4S23);
+ MD4_STEP0(MD4_H , a, b, c, d, H_w2c02, MD4S20);
+ MD4_STEP0(MD4_H , d, a, b, c, H_wac02, MD4S21);
+ MD4_STEP0(MD4_H , c, d, a, b, H_w6c02, MD4S22);
+ MD4_STEP0(MD4_H , b, c, d, a, H_wec02, MD4S23);
+ MD4_STEP0(MD4_H , a, b, c, d, H_w1c02, MD4S20);
+ MD4_STEP0(MD4_H , d, a, b, c, H_w9c02, MD4S21);
+ MD4_STEP0(MD4_H , c, d, a, b, H_w5c02, MD4S22);
+ MD4_STEP0(MD4_H , b, c, d, a, H_wdc02, MD4S23);
+ MD4_STEP0(MD4_H , a, b, c, d, H_w3c02, MD4S20);
+ MD4_STEP0(MD4_H , d, a, b, c, H_wbc02, MD4S21);
+ MD4_STEP0(MD4_H , c, d, a, b, H_w7c02, MD4S22);
+ MD4_STEP0(MD4_H , b, c, d, a, H_wfc02, MD4S23);
COMPARE_S_SIMD (a, d, c, b);
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD4_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MD4M_A;
- u32 b = MD4M_B;
- u32 c = MD4M_C;
- u32 d = MD4M_D;
+ u32x a = MD4M_A;
+ u32x b = MD4M_B;
+ u32x c = MD4M_C;
+ u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00);
MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01);
c += MD4M_C;
d += MD4M_D;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = a;
w0_t[1] = b;
MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
-static void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MD4M_A;
- u32 b = MD4M_B;
- u32 c = MD4M_C;
- u32 d = MD4M_D;
+ u32x a = MD4M_A;
+ u32x b = MD4M_B;
+ u32x c = MD4M_C;
+ u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00);
MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01);
c += MD4M_C;
d += MD4M_D;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = a;
w0_t[1] = b;
MD4_STEP (MD4_H , b, c, d, a, w3_t[1], MD4C02, MD4S23);
MD4_STEP (MD4_H , a, b, c, d, w0_t[3], MD4C02, MD4S20);
- bool q_cond = allx (search[0] != a);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS (a, search[0])) continue;
MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21);
MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
-__kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m01400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- u32 w0_t = w0;
- u32 w1_t = w[ 1];
- u32 w2_t = w[ 2];
- u32 w3_t = w[ 3];
- u32 w4_t = w[ 4];
- u32 w5_t = w[ 5];
- u32 w6_t = w[ 6];
- u32 w7_t = w[ 7];
- u32 w8_t = w[ 8];
- u32 w9_t = w[ 9];
- u32 wa_t = w[10];
- u32 wb_t = w[11];
- u32 wc_t = w[12];
- u32 wd_t = w[13];
- u32 we_t = w[14];
- u32 wf_t = w[15];
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ u32x w0_t = w0;
+ u32x w1_t = w[ 1];
+ u32x w2_t = w[ 2];
+ u32x w3_t = w[ 3];
+ u32x w4_t = w[ 4];
+ u32x w5_t = w[ 5];
+ u32x w6_t = w[ 6];
+ u32x w7_t = w[ 7];
+ u32x w8_t = w[ 8];
+ u32x w9_t = w[ 9];
+ u32x wa_t = w[10];
+ u32x wb_t = w[11];
+ u32x wc_t = w[12];
+ u32x wd_t = w[13];
+ u32x we_t = w[14];
+ u32x wf_t = w[15];
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, h, c, g);
}
}
-static void m01400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- u32 w0_t = w0;
- u32 w1_t = w[ 1];
- u32 w2_t = w[ 2];
- u32 w3_t = w[ 3];
- u32 w4_t = w[ 4];
- u32 w5_t = w[ 5];
- u32 w6_t = w[ 6];
- u32 w7_t = w[ 7];
- u32 w8_t = w[ 8];
- u32 w9_t = w[ 9];
- u32 wa_t = w[10];
- u32 wb_t = w[11];
- u32 wc_t = w[12];
- u32 wd_t = w[13];
- u32 we_t = w[14];
- u32 wf_t = w[15];
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ u32x w0_t = w0;
+ u32x w1_t = w[ 1];
+ u32x w2_t = w[ 2];
+ u32x w3_t = w[ 3];
+ u32x w4_t = w[ 4];
+ u32x w5_t = w[ 5];
+ u32x w6_t = w[ 6];
+ u32x w7_t = w[ 7];
+ u32x w8_t = w[ 8];
+ u32x w9_t = w[ 9];
+ u32x wa_t = w[10];
+ u32x wb_t = w[11];
+ u32x wc_t = w[12];
+ u32x wd_t = w[13];
+ u32x we_t = w[14];
+ u32x wf_t = w[15];
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a);
wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b);
wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c);
+
+ if (MATCHES_NONE_VS (d, search[0])) continue;
+
wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, h, c, g);
}
}
-__kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01400m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01400m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01400m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01400s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01400s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 out_salt_len = out_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 out_salt_len = out_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
#define _SHA256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m01410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
-
- w[ 0] |= swap32 (salt_buf0[0]);
- w[ 1] |= swap32 (salt_buf0[1]);
- w[ 2] |= swap32 (salt_buf0[2]);
- w[ 3] |= swap32 (salt_buf0[3]);
- w[ 4] |= swap32 (salt_buf1[0]);
- w[ 5] |= swap32 (salt_buf1[1]);
- w[ 6] |= swap32 (salt_buf1[2]);
- w[ 7] |= swap32 (salt_buf1[3]);
- w[ 8] |= swap32 (salt_buf2[0]);
- w[ 9] |= swap32 (salt_buf2[1]);
- w[10] |= swap32 (salt_buf2[2]);
- w[11] |= swap32 (salt_buf2[3]);
- w[12] |= swap32 (salt_buf3[0]);
- w[13] |= swap32 (salt_buf3[1]);
- w[14] |= swap32 (salt_buf3[2]);
- w[15] |= swap32 (salt_buf3[3]);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+
+ w[ 0] |= swap32_S (salt_buf0[0]);
+ w[ 1] |= swap32_S (salt_buf0[1]);
+ w[ 2] |= swap32_S (salt_buf0[2]);
+ w[ 3] |= swap32_S (salt_buf0[3]);
+ w[ 4] |= swap32_S (salt_buf1[0]);
+ w[ 5] |= swap32_S (salt_buf1[1]);
+ w[ 6] |= swap32_S (salt_buf1[2]);
+ w[ 7] |= swap32_S (salt_buf1[3]);
+ w[ 8] |= swap32_S (salt_buf2[0]);
+ w[ 9] |= swap32_S (salt_buf2[1]);
+ w[10] |= swap32_S (salt_buf2[2]);
+ w[11] |= swap32_S (salt_buf2[3]);
+ w[12] |= swap32_S (salt_buf3[0]);
+ w[13] |= swap32_S (salt_buf3[1]);
+ w[14] |= swap32_S (salt_buf3[2]);
+ w[15] |= swap32_S (salt_buf3[3]);
const u32 salt_len = salt_bufs[salt_pos].salt_len;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- u32 w0_t = w0;
- u32 w1_t = w[ 1];
- u32 w2_t = w[ 2];
- u32 w3_t = w[ 3];
- u32 w4_t = w[ 4];
- u32 w5_t = w[ 5];
- u32 w6_t = w[ 6];
- u32 w7_t = w[ 7];
- u32 w8_t = w[ 8];
- u32 w9_t = w[ 9];
- u32 wa_t = w[10];
- u32 wb_t = w[11];
- u32 wc_t = w[12];
- u32 wd_t = w[13];
- u32 we_t = w[14];
- u32 wf_t = w[15];
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ u32x w0_t = w0;
+ u32x w1_t = w[ 1];
+ u32x w2_t = w[ 2];
+ u32x w3_t = w[ 3];
+ u32x w4_t = w[ 4];
+ u32x w5_t = w[ 5];
+ u32x w6_t = w[ 6];
+ u32x w7_t = w[ 7];
+ u32x w8_t = w[ 8];
+ u32x w9_t = w[ 9];
+ u32x wa_t = w[10];
+ u32x wb_t = w[11];
+ u32x wc_t = w[12];
+ u32x wd_t = w[13];
+ u32x we_t = w[14];
+ u32x wf_t = w[15];
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, h, c, g);
}
}
-static void m01410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- u32 w0_t = w0;
- u32 w1_t = w[ 1];
- u32 w2_t = w[ 2];
- u32 w3_t = w[ 3];
- u32 w4_t = w[ 4];
- u32 w5_t = w[ 5];
- u32 w6_t = w[ 6];
- u32 w7_t = w[ 7];
- u32 w8_t = w[ 8];
- u32 w9_t = w[ 9];
- u32 wa_t = w[10];
- u32 wb_t = w[11];
- u32 wc_t = w[12];
- u32 wd_t = w[13];
- u32 we_t = w[14];
- u32 wf_t = w[15];
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ u32x w0_t = w0;
+ u32x w1_t = w[ 1];
+ u32x w2_t = w[ 2];
+ u32x w3_t = w[ 3];
+ u32x w4_t = w[ 4];
+ u32x w5_t = w[ 5];
+ u32x w6_t = w[ 6];
+ u32x w7_t = w[ 7];
+ u32x w8_t = w[ 8];
+ u32x w9_t = w[ 9];
+ u32x wa_t = w[10];
+ u32x wb_t = w[11];
+ u32x wc_t = w[12];
+ u32x wd_t = w[13];
+ u32x we_t = w[14];
+ u32x wf_t = w[15];
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a);
wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b);
wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c);
+
+ if (MATCHES_NONE_VS (d, search[0])) continue;
+
wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, h, c, g);
}
}
-__kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01410m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01410m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01410m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01410s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01410s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01410_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01410_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
const u32 out_salt_len = out_len + salt_len;
- switch_buffer_by_offset (w0, w1, w2, w3, salt_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
const u32 out_salt_len = out_len + salt_len;
- switch_buffer_by_offset (w0, w1, w2, w3, salt_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
const u32 pw_salt_len = pw_len + salt_len;
- switch_buffer_by_offset (w0, w1, w2, w3, salt_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
const u32 pw_salt_len = pw_len + salt_len;
- switch_buffer_by_offset (w0, w1, w2, w3, salt_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
#define _SHA256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m01420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- /**
- * prepend salt
- */
-
- u32 w0_t2[4];
- u32 w1_t2[4];
- u32 w2_t2[4];
- u32 w3_t2[4];
-
- w0_t2[0] = swap32 (w0[0]);
- w0_t2[1] = swap32 (w0[1]);
- w0_t2[2] = swap32 (w0[2]);
- w0_t2[3] = swap32 (w0[3]);
- w1_t2[0] = swap32 (w1[0]);
- w1_t2[1] = swap32 (w1[1]);
- w1_t2[2] = swap32 (w1[2]);
- w1_t2[3] = swap32 (w1[3]);
- w2_t2[0] = swap32 (w2[0]);
- w2_t2[1] = swap32 (w2[1]);
- w2_t2[2] = swap32 (w2[2]);
- w2_t2[3] = swap32 (w2[3]);
- w3_t2[0] = swap32 (w3[0]);
- w3_t2[1] = swap32 (w3[1]);
- w3_t2[2] = swap32 (w3[2]);
- w3_t2[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
-
- w0_t2[0] |= salt_buf0[0];
- w0_t2[1] |= salt_buf0[1];
- w0_t2[2] |= salt_buf0[2];
- w0_t2[3] |= salt_buf0[3];
- w1_t2[0] |= salt_buf1[0];
- w1_t2[1] |= salt_buf1[1];
- w1_t2[2] |= salt_buf1[2];
- w1_t2[3] |= salt_buf1[3];
- w2_t2[0] |= salt_buf2[0];
- w2_t2[1] |= salt_buf2[1];
- w2_t2[2] |= salt_buf2[2];
- w2_t2[3] |= salt_buf2[3];
- w3_t2[0] |= salt_buf3[0];
- w3_t2[1] |= salt_buf3[1];
- w3_t2[2] |= salt_buf3[2];
- w3_t2[3] |= salt_buf3[3];
-
- /**
- * sha256
- */
-
- u32 w0_t = swap32 (w0_t2[0]);
- u32 w1_t = swap32 (w0_t2[1]);
- u32 w2_t = swap32 (w0_t2[2]);
- u32 w3_t = swap32 (w0_t2[3]);
- u32 w4_t = swap32 (w1_t2[0]);
- u32 w5_t = swap32 (w1_t2[1]);
- u32 w6_t = swap32 (w1_t2[2]);
- u32 w7_t = swap32 (w1_t2[3]);
- u32 w8_t = swap32 (w2_t2[0]);
- u32 w9_t = swap32 (w2_t2[1]);
- u32 wa_t = swap32 (w2_t2[2]);
- u32 wb_t = swap32 (w2_t2[3]);
- u32 wc_t = swap32 (w3_t2[0]);
- u32 wd_t = swap32 (w3_t2[1]);
- u32 we_t = 0;
- u32 wf_t = pw_salt_len * 8;
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_be (wx, w0lr, salt_len);
+
+ u32x w0_t = wx[ 0];
+ u32x w1_t = wx[ 1];
+ u32x w2_t = wx[ 2];
+ u32x w3_t = wx[ 3];
+ u32x w4_t = wx[ 4];
+ u32x w5_t = wx[ 5];
+ u32x w6_t = wx[ 6];
+ u32x w7_t = wx[ 7];
+ u32x w8_t = wx[ 8];
+ u32x w9_t = wx[ 9];
+ u32x wa_t = wx[10];
+ u32x wb_t = wx[11];
+ u32x wc_t = wx[12];
+ u32x wd_t = wx[13];
+ u32x we_t = 0;
+ u32x wf_t = pw_salt_len * 8;
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, h, c, g);
}
}
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- /**
- * prepend salt
- */
-
- u32 w0_t2[4];
- u32 w1_t2[4];
- u32 w2_t2[4];
- u32 w3_t2[4];
-
- w0_t2[0] = swap32 (w0[0]);
- w0_t2[1] = swap32 (w0[1]);
- w0_t2[2] = swap32 (w0[2]);
- w0_t2[3] = swap32 (w0[3]);
- w1_t2[0] = swap32 (w1[0]);
- w1_t2[1] = swap32 (w1[1]);
- w1_t2[2] = swap32 (w1[2]);
- w1_t2[3] = swap32 (w1[3]);
- w2_t2[0] = swap32 (w2[0]);
- w2_t2[1] = swap32 (w2[1]);
- w2_t2[2] = swap32 (w2[2]);
- w2_t2[3] = swap32 (w2[3]);
- w3_t2[0] = swap32 (w3[0]);
- w3_t2[1] = swap32 (w3[1]);
- w3_t2[2] = swap32 (w3[2]);
- w3_t2[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
-
- w0_t2[0] |= salt_buf0[0];
- w0_t2[1] |= salt_buf0[1];
- w0_t2[2] |= salt_buf0[2];
- w0_t2[3] |= salt_buf0[3];
- w1_t2[0] |= salt_buf1[0];
- w1_t2[1] |= salt_buf1[1];
- w1_t2[2] |= salt_buf1[2];
- w1_t2[3] |= salt_buf1[3];
- w2_t2[0] |= salt_buf2[0];
- w2_t2[1] |= salt_buf2[1];
- w2_t2[2] |= salt_buf2[2];
- w2_t2[3] |= salt_buf2[3];
- w3_t2[0] |= salt_buf3[0];
- w3_t2[1] |= salt_buf3[1];
- w3_t2[2] |= salt_buf3[2];
- w3_t2[3] |= salt_buf3[3];
-
- /**
- * sha256
- */
-
- u32 w0_t = swap32 (w0_t2[0]);
- u32 w1_t = swap32 (w0_t2[1]);
- u32 w2_t = swap32 (w0_t2[2]);
- u32 w3_t = swap32 (w0_t2[3]);
- u32 w4_t = swap32 (w1_t2[0]);
- u32 w5_t = swap32 (w1_t2[1]);
- u32 w6_t = swap32 (w1_t2[2]);
- u32 w7_t = swap32 (w1_t2[3]);
- u32 w8_t = swap32 (w2_t2[0]);
- u32 w9_t = swap32 (w2_t2[1]);
- u32 wa_t = swap32 (w2_t2[2]);
- u32 wb_t = swap32 (w2_t2[3]);
- u32 wc_t = swap32 (w3_t2[0]);
- u32 wd_t = swap32 (w3_t2[1]);
- u32 we_t = 0;
- u32 wf_t = pw_salt_len * 8;
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_be (wx, w0lr, salt_len);
+
+ u32x w0_t = wx[ 0];
+ u32x w1_t = wx[ 1];
+ u32x w2_t = wx[ 2];
+ u32x w3_t = wx[ 3];
+ u32x w4_t = wx[ 4];
+ u32x w5_t = wx[ 5];
+ u32x w6_t = wx[ 6];
+ u32x w7_t = wx[ 7];
+ u32x w8_t = wx[ 8];
+ u32x w9_t = wx[ 9];
+ u32x wa_t = wx[10];
+ u32x wb_t = wx[11];
+ u32x wc_t = wx[12];
+ u32x wd_t = wx[13];
+ u32x we_t = 0;
+ u32x wf_t = pw_salt_len * 8;
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a);
wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b);
wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c);
+
+ if (MATCHES_NONE_VS (d, search[0])) continue;
+
wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, h, c, g);
}
}
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2));
const u32 out_salt_len = (out_len * 2) + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2));
const u32 out_salt_len = (out_len * 2) + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2));
const u32 pw_salt_len = (pw_len * 2) + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2));
const u32 pw_salt_len = (pw_len * 2) + salt_len;
#define _SHA256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m01430m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01430m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
-
- w[ 0] |= swap32 (salt_buf0[0]);
- w[ 1] |= swap32 (salt_buf0[1]);
- w[ 2] |= swap32 (salt_buf0[2]);
- w[ 3] |= swap32 (salt_buf0[3]);
- w[ 4] |= swap32 (salt_buf1[0]);
- w[ 5] |= swap32 (salt_buf1[1]);
- w[ 6] |= swap32 (salt_buf1[2]);
- w[ 7] |= swap32 (salt_buf1[3]);
- w[ 8] |= swap32 (salt_buf2[0]);
- w[ 9] |= swap32 (salt_buf2[1]);
- w[10] |= swap32 (salt_buf2[2]);
- w[11] |= swap32 (salt_buf2[3]);
- w[12] |= swap32 (salt_buf3[0]);
- w[13] |= swap32 (salt_buf3[1]);
- w[14] |= swap32 (salt_buf3[2]);
- w[15] |= swap32 (salt_buf3[3]);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+
+ w[ 0] |= swap32_S (salt_buf0[0]);
+ w[ 1] |= swap32_S (salt_buf0[1]);
+ w[ 2] |= swap32_S (salt_buf0[2]);
+ w[ 3] |= swap32_S (salt_buf0[3]);
+ w[ 4] |= swap32_S (salt_buf1[0]);
+ w[ 5] |= swap32_S (salt_buf1[1]);
+ w[ 6] |= swap32_S (salt_buf1[2]);
+ w[ 7] |= swap32_S (salt_buf1[3]);
+ w[ 8] |= swap32_S (salt_buf2[0]);
+ w[ 9] |= swap32_S (salt_buf2[1]);
+ w[10] |= swap32_S (salt_buf2[2]);
+ w[11] |= swap32_S (salt_buf2[3]);
+ w[12] |= swap32_S (salt_buf3[0]);
+ w[13] |= swap32_S (salt_buf3[1]);
+ w[14] |= swap32_S (salt_buf3[2]);
+ w[15] |= swap32_S (salt_buf3[3]);
const u32 salt_len = salt_bufs[salt_pos].salt_len;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- u32 w0_t = w0;
- u32 w1_t = w[ 1];
- u32 w2_t = w[ 2];
- u32 w3_t = w[ 3];
- u32 w4_t = w[ 4];
- u32 w5_t = w[ 5];
- u32 w6_t = w[ 6];
- u32 w7_t = w[ 7];
- u32 w8_t = w[ 8];
- u32 w9_t = w[ 9];
- u32 wa_t = w[10];
- u32 wb_t = w[11];
- u32 wc_t = w[12];
- u32 wd_t = w[13];
- u32 we_t = w[14];
- u32 wf_t = w[15];
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ u32x w0_t = w0;
+ u32x w1_t = w[ 1];
+ u32x w2_t = w[ 2];
+ u32x w3_t = w[ 3];
+ u32x w4_t = w[ 4];
+ u32x w5_t = w[ 5];
+ u32x w6_t = w[ 6];
+ u32x w7_t = w[ 7];
+ u32x w8_t = w[ 8];
+ u32x w9_t = w[ 9];
+ u32x wa_t = w[10];
+ u32x wb_t = w[11];
+ u32x wc_t = w[12];
+ u32x wd_t = w[13];
+ u32x we_t = w[14];
+ u32x wf_t = w[15];
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, h, c, g);
}
}
-static void m01430s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01430s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
-
- u32 w0_t = w0;
- u32 w1_t = w[ 1];
- u32 w2_t = w[ 2];
- u32 w3_t = w[ 3];
- u32 w4_t = w[ 4];
- u32 w5_t = w[ 5];
- u32 w6_t = w[ 6];
- u32 w7_t = w[ 7];
- u32 w8_t = w[ 8];
- u32 w9_t = w[ 9];
- u32 wa_t = w[10];
- u32 wb_t = w[11];
- u32 wc_t = w[12];
- u32 wd_t = w[13];
- u32 we_t = w[14];
- u32 wf_t = w[15];
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+ const u32x w0 = w0l | w0r;
+
+ u32x w0_t = w0;
+ u32x w1_t = w[ 1];
+ u32x w2_t = w[ 2];
+ u32x w3_t = w[ 3];
+ u32x w4_t = w[ 4];
+ u32x w5_t = w[ 5];
+ u32x w6_t = w[ 6];
+ u32x w7_t = w[ 7];
+ u32x w8_t = w[ 8];
+ u32x w9_t = w[ 9];
+ u32x wa_t = w[10];
+ u32x wb_t = w[11];
+ u32x wc_t = w[12];
+ u32x wd_t = w[13];
+ u32x we_t = w[14];
+ u32x wf_t = w[15];
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a);
wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b);
wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c);
+
+ if (MATCHES_NONE_VS (d, search[0])) continue;
+
wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, h, c, g);
}
}
-__kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01430m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01430_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01430_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01430m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01430_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01430_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01430m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01430s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01430_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01430_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01430s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01430_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01430_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
make_unicode (w0, w0_t2, w1_t2);
make_unicode (w1, w2_t2, w3_t2);
- switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
+ switch_buffer_by_offset_le (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
w0_t2[0] |= salt_buf0[0];
w0_t2[1] |= salt_buf0[1];
make_unicode (w0, w0_t2, w1_t2);
make_unicode (w1, w2_t2, w3_t2);
- switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
+ switch_buffer_by_offset_le (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
w0_t2[0] |= salt_buf0[0];
w0_t2[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
make_unicode (w0, w0_t2, w1_t2);
make_unicode (w1, w2_t2, w3_t2);
- switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
+ switch_buffer_by_offset_le (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
w0_t2[0] |= salt_buf0[0];
w0_t2[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
make_unicode (w0, w0_t2, w1_t2);
make_unicode (w1, w2_t2, w3_t2);
- switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
+ switch_buffer_by_offset_le (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
w0_t2[0] |= salt_buf0[0];
w0_t2[1] |= salt_buf0[1];
#define _SHA256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m01440m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- /**
- * prepend salt
- */
-
- u32 w0_t2[4];
- u32 w1_t2[4];
- u32 w2_t2[4];
- u32 w3_t2[4];
-
- w0_t2[0] = swap32 (w0[0]);
- w0_t2[1] = swap32 (w0[1]);
- w0_t2[2] = swap32 (w0[2]);
- w0_t2[3] = swap32 (w0[3]);
- w1_t2[0] = swap32 (w1[0]);
- w1_t2[1] = swap32 (w1[1]);
- w1_t2[2] = swap32 (w1[2]);
- w1_t2[3] = swap32 (w1[3]);
- w2_t2[0] = swap32 (w2[0]);
- w2_t2[1] = swap32 (w2[1]);
- w2_t2[2] = swap32 (w2[2]);
- w2_t2[3] = swap32 (w2[3]);
- w3_t2[0] = swap32 (w3[0]);
- w3_t2[1] = swap32 (w3[1]);
- w3_t2[2] = swap32 (w3[2]);
- w3_t2[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
-
- w0_t2[0] |= salt_buf0[0];
- w0_t2[1] |= salt_buf0[1];
- w0_t2[2] |= salt_buf0[2];
- w0_t2[3] |= salt_buf0[3];
- w1_t2[0] |= salt_buf1[0];
- w1_t2[1] |= salt_buf1[1];
- w1_t2[2] |= salt_buf1[2];
- w1_t2[3] |= salt_buf1[3];
- w2_t2[0] |= salt_buf2[0];
- w2_t2[1] |= salt_buf2[1];
- w2_t2[2] |= salt_buf2[2];
- w2_t2[3] |= salt_buf2[3];
- w3_t2[0] |= salt_buf3[0];
- w3_t2[1] |= salt_buf3[1];
- w3_t2[2] |= salt_buf3[2];
- w3_t2[3] |= salt_buf3[3];
-
- /**
- * sha256
- */
-
- u32 w0_t = swap32 (w0_t2[0]);
- u32 w1_t = swap32 (w0_t2[1]);
- u32 w2_t = swap32 (w0_t2[2]);
- u32 w3_t = swap32 (w0_t2[3]);
- u32 w4_t = swap32 (w1_t2[0]);
- u32 w5_t = swap32 (w1_t2[1]);
- u32 w6_t = swap32 (w1_t2[2]);
- u32 w7_t = swap32 (w1_t2[3]);
- u32 w8_t = swap32 (w2_t2[0]);
- u32 w9_t = swap32 (w2_t2[1]);
- u32 wa_t = swap32 (w2_t2[2]);
- u32 wb_t = swap32 (w2_t2[3]);
- u32 wc_t = swap32 (w3_t2[0]);
- u32 wd_t = swap32 (w3_t2[1]);
- u32 we_t = 0;
- u32 wf_t = pw_salt_len * 8;
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_be (wx, w0lr, salt_len);
+
+ u32x w0_t = wx[ 0];
+ u32x w1_t = wx[ 1];
+ u32x w2_t = wx[ 2];
+ u32x w3_t = wx[ 3];
+ u32x w4_t = wx[ 4];
+ u32x w5_t = wx[ 5];
+ u32x w6_t = wx[ 6];
+ u32x w7_t = wx[ 7];
+ u32x w8_t = wx[ 8];
+ u32x w9_t = wx[ 9];
+ u32x wa_t = wx[10];
+ u32x wb_t = wx[11];
+ u32x wc_t = wx[12];
+ u32x wd_t = wx[13];
+ u32x we_t = 0;
+ u32x wf_t = pw_salt_len * 8;
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, h, c, g);
}
}
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- /**
- * prepend salt
- */
-
- u32 w0_t2[4];
- u32 w1_t2[4];
- u32 w2_t2[4];
- u32 w3_t2[4];
-
- w0_t2[0] = swap32 (w0[0]);
- w0_t2[1] = swap32 (w0[1]);
- w0_t2[2] = swap32 (w0[2]);
- w0_t2[3] = swap32 (w0[3]);
- w1_t2[0] = swap32 (w1[0]);
- w1_t2[1] = swap32 (w1[1]);
- w1_t2[2] = swap32 (w1[2]);
- w1_t2[3] = swap32 (w1[3]);
- w2_t2[0] = swap32 (w2[0]);
- w2_t2[1] = swap32 (w2[1]);
- w2_t2[2] = swap32 (w2[2]);
- w2_t2[3] = swap32 (w2[3]);
- w3_t2[0] = swap32 (w3[0]);
- w3_t2[1] = swap32 (w3[1]);
- w3_t2[2] = swap32 (w3[2]);
- w3_t2[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len);
-
- w0_t2[0] |= salt_buf0[0];
- w0_t2[1] |= salt_buf0[1];
- w0_t2[2] |= salt_buf0[2];
- w0_t2[3] |= salt_buf0[3];
- w1_t2[0] |= salt_buf1[0];
- w1_t2[1] |= salt_buf1[1];
- w1_t2[2] |= salt_buf1[2];
- w1_t2[3] |= salt_buf1[3];
- w2_t2[0] |= salt_buf2[0];
- w2_t2[1] |= salt_buf2[1];
- w2_t2[2] |= salt_buf2[2];
- w2_t2[3] |= salt_buf2[3];
- w3_t2[0] |= salt_buf3[0];
- w3_t2[1] |= salt_buf3[1];
- w3_t2[2] |= salt_buf3[2];
- w3_t2[3] |= salt_buf3[3];
-
- /**
- * sha256
- */
-
- u32 w0_t = swap32 (w0_t2[0]);
- u32 w1_t = swap32 (w0_t2[1]);
- u32 w2_t = swap32 (w0_t2[2]);
- u32 w3_t = swap32 (w0_t2[3]);
- u32 w4_t = swap32 (w1_t2[0]);
- u32 w5_t = swap32 (w1_t2[1]);
- u32 w6_t = swap32 (w1_t2[2]);
- u32 w7_t = swap32 (w1_t2[3]);
- u32 w8_t = swap32 (w2_t2[0]);
- u32 w9_t = swap32 (w2_t2[1]);
- u32 wa_t = swap32 (w2_t2[2]);
- u32 wb_t = swap32 (w2_t2[3]);
- u32 wc_t = swap32 (w3_t2[0]);
- u32 wd_t = swap32 (w3_t2[1]);
- u32 we_t = 0;
- u32 wf_t = pw_salt_len * 8;
-
- u32 a = SHA256M_A;
- u32 b = SHA256M_B;
- u32 c = SHA256M_C;
- u32 d = SHA256M_D;
- u32 e = SHA256M_E;
- u32 f = SHA256M_F;
- u32 g = SHA256M_G;
- u32 h = SHA256M_H;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_be (wx, w0lr, salt_len);
+
+ u32x w0_t = wx[ 0];
+ u32x w1_t = wx[ 1];
+ u32x w2_t = wx[ 2];
+ u32x w3_t = wx[ 3];
+ u32x w4_t = wx[ 4];
+ u32x w5_t = wx[ 5];
+ u32x w6_t = wx[ 6];
+ u32x w7_t = wx[ 7];
+ u32x w8_t = wx[ 8];
+ u32x w9_t = wx[ 9];
+ u32x wa_t = wx[10];
+ u32x wb_t = wx[11];
+ u32x wc_t = wx[12];
+ u32x wd_t = wx[13];
+ u32x we_t = 0;
+ u32x wf_t = pw_salt_len * 8;
+
+ u32x a = SHA256M_A;
+ u32x b = SHA256M_B;
+ u32x c = SHA256M_C;
+ u32x d = SHA256M_D;
+ u32x e = SHA256M_E;
+ u32x f = SHA256M_F;
+ u32x g = SHA256M_G;
+ u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a);
wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b);
wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c);
+
+ if (MATCHES_NONE_VS (d, search[0])) continue;
+
wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
-
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, h, c, g);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 k_sha256[64] =
{
SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f,
};
-static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8])
+static void sha256_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[8])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
- u32 e = digest[4];
- u32 f = digest[5];
- u32 g = digest[6];
- u32 h = digest[7];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+ u32x e = digest[4];
+ u32x f = digest[5];
+ u32x g = digest[6];
+ u32x h = digest[7];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#define ROUND_EXPAND() \
{ \
digest[7] += h;
}
-static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8])
+static void hmac_sha256_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
sha256_transform (w0, w1, w2, w3, opad);
}
-static void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8])
+static void hmac_sha256_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8], u32x digest[8])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[8];
- u32 opad[8];
+ u32x ipad[8];
+ u32x opad[8];
hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (64 + salt_len) * 8;
- u32 digest[8];
+ u32x digest[8];
hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[7];
- const u32 r2 = digest[2];
- const u32 r3 = digest[6];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[8];
- u32 opad[8];
+ u32x ipad[8];
+ u32x opad[8];
hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (64 + salt_len) * 8;
- u32 digest[8];
+ u32x digest[8];
hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[7];
- const u32 r2 = digest[2];
- const u32 r3 = digest[6];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 k_sha256[64] =
{
SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f,
};
-static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8])
+static void sha256_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[8])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
- u32 e = digest[4];
- u32 f = digest[5];
- u32 g = digest[6];
- u32 h = digest[7];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+ u32x e = digest[4];
+ u32x f = digest[5];
+ u32x g = digest[6];
+ u32x h = digest[7];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#define ROUND_EXPAND() \
{ \
digest[7] += h;
}
-static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8])
+static void hmac_sha256_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
sha256_transform (w0, w1, w2, w3, opad);
}
-static void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8])
+static void hmac_sha256_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8], u32x digest[8])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = swap32 (salt_buf0[0]);
w0_t[1] = swap32 (salt_buf0[1]);
w0_t[2] = swap32 (salt_buf0[2]);
w0_t[3] = swap32 (salt_buf0[3]);
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = swap32 (salt_buf1[0]);
w1_t[1] = swap32 (salt_buf1[1]);
w1_t[2] = swap32 (salt_buf1[2]);
w1_t[3] = swap32 (salt_buf1[3]);
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[8];
- u32 opad[8];
+ u32x ipad[8];
+ u32x opad[8];
hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = (64 + pw_len) * 8;
- u32 digest[8];
+ u32x digest[8];
hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[7];
- const u32 r2 = digest[2];
- const u32 r3 = digest[6];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]);
}
}
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = swap32 (salt_buf0[0]);
w0_t[1] = swap32 (salt_buf0[1]);
w0_t[2] = swap32 (salt_buf0[2]);
w0_t[3] = swap32 (salt_buf0[3]);
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = swap32 (salt_buf1[0]);
w1_t[1] = swap32 (salt_buf1[1]);
w1_t[2] = swap32 (salt_buf1[2]);
w1_t[3] = swap32 (salt_buf1[3]);
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[8];
- u32 opad[8];
+ u32x ipad[8];
+ u32x opad[8];
hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = (64 + pw_len) * 8;
- u32 digest[8];
+ u32x digest[8];
hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[7];
- const u32 r2 = digest[2];
- const u32 r3 = digest[6];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA512_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 k_sha512[80] =
{
SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f,
};
-static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8])
+static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8])
{
- u64 w0_t = hl32_to_64 (w0[0], w0[1]);
- u64 w1_t = hl32_to_64 (w0[2], w0[3]);
- u64 w2_t = hl32_to_64 (w1[0], w1[1]);
- u64 w3_t = hl32_to_64 (w1[2], w1[3]);
- u64 w4_t = hl32_to_64 (w2[0], w2[1]);
- u64 w5_t = hl32_to_64 (w2[2], w2[3]);
- u64 w6_t = hl32_to_64 (w3[0], w3[1]);
- u64 w7_t = 0;
- u64 w8_t = 0;
- u64 w9_t = 0;
- u64 wa_t = 0;
- u64 wb_t = 0;
- u64 wc_t = 0;
- u64 wd_t = 0;
- u64 we_t = 0;
- u64 wf_t = hl32_to_64 (w3[2], w3[3]);
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ u64x w0_t = hl32_to_64 (w0[0], w0[1]);
+ u64x w1_t = hl32_to_64 (w0[2], w0[3]);
+ u64x w2_t = hl32_to_64 (w1[0], w1[1]);
+ u64x w3_t = hl32_to_64 (w1[2], w1[3]);
+ u64x w4_t = hl32_to_64 (w2[0], w2[1]);
+ u64x w5_t = hl32_to_64 (w2[2], w2[3]);
+ u64x w6_t = hl32_to_64 (w3[0], w3[1]);
+ u64x w7_t = 0;
+ u64x w8_t = 0;
+ u64x w9_t = 0;
+ u64x wa_t = 0;
+ u64x wb_t = 0;
+ u64x wc_t = 0;
+ u64x wd_t = 0;
+ u64x we_t = 0;
+ u64x wf_t = hl32_to_64 (w3[2], w3[3]);
+
+ u64x a = digest[0];
+ u64x b = digest[1];
+ u64x c = digest[2];
+ u64x d = digest[3];
+ u64x e = digest[4];
+ u64x f = digest[5];
+ u64x g = digest[6];
+ u64x h = digest[7];
#define ROUND_EXPAND() \
{ \
digest[7] = h;
}
-static void m01700m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01700m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = w0;
w0_t[1] = w[ 1];
w3_t[2] = w[14];
w3_t[3] = w[15];
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_M
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
-static void m01700s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01700s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = w0;
w0_t[1] = w[ 1];
w3_t[2] = w[14];
w3_t[3] = w[15];
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_S
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
-__kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01700m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01700m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01700m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01700s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01700s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 out_salt_len = out_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 out_salt_len = out_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
#define _SHA512_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 k_sha512[80] =
{
SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f,
};
-static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8])
+static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8])
{
- u64 w0_t = hl32_to_64 (w0[0], w0[1]);
- u64 w1_t = hl32_to_64 (w0[2], w0[3]);
- u64 w2_t = hl32_to_64 (w1[0], w1[1]);
- u64 w3_t = hl32_to_64 (w1[2], w1[3]);
- u64 w4_t = hl32_to_64 (w2[0], w2[1]);
- u64 w5_t = hl32_to_64 (w2[2], w2[3]);
- u64 w6_t = hl32_to_64 (w3[0], w3[1]);
- u64 w7_t = 0;
- u64 w8_t = 0;
- u64 w9_t = 0;
- u64 wa_t = 0;
- u64 wb_t = 0;
- u64 wc_t = 0;
- u64 wd_t = 0;
- u64 we_t = 0;
- u64 wf_t = hl32_to_64 (w3[2], w3[3]);
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ u64x w0_t = hl32_to_64 (w0[0], w0[1]);
+ u64x w1_t = hl32_to_64 (w0[2], w0[3]);
+ u64x w2_t = hl32_to_64 (w1[0], w1[1]);
+ u64x w3_t = hl32_to_64 (w1[2], w1[3]);
+ u64x w4_t = hl32_to_64 (w2[0], w2[1]);
+ u64x w5_t = hl32_to_64 (w2[2], w2[3]);
+ u64x w6_t = hl32_to_64 (w3[0], w3[1]);
+ u64x w7_t = 0;
+ u64x w8_t = 0;
+ u64x w9_t = 0;
+ u64x wa_t = 0;
+ u64x wb_t = 0;
+ u64x wc_t = 0;
+ u64x wd_t = 0;
+ u64x we_t = 0;
+ u64x wf_t = hl32_to_64 (w3[2], w3[3]);
+
+ u64x a = digest[0];
+ u64x b = digest[1];
+ u64x c = digest[2];
+ u64x d = digest[3];
+ u64x e = digest[4];
+ u64x f = digest[5];
+ u64x g = digest[6];
+ u64x h = digest[7];
#define ROUND_EXPAND() \
{ \
digest[7] = h;
}
-static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
-
- w[ 0] |= swap32 (salt_buf0[0]);
- w[ 1] |= swap32 (salt_buf0[1]);
- w[ 2] |= swap32 (salt_buf0[2]);
- w[ 3] |= swap32 (salt_buf0[3]);
- w[ 4] |= swap32 (salt_buf1[0]);
- w[ 5] |= swap32 (salt_buf1[1]);
- w[ 6] |= swap32 (salt_buf1[2]);
- w[ 7] |= swap32 (salt_buf1[3]);
- w[ 8] |= swap32 (salt_buf2[0]);
- w[ 9] |= swap32 (salt_buf2[1]);
- w[10] |= swap32 (salt_buf2[2]);
- w[11] |= swap32 (salt_buf2[3]);
- w[12] |= swap32 (salt_buf3[0]);
- w[13] |= swap32 (salt_buf3[1]);
- w[14] |= swap32 (salt_buf3[2]);
- w[15] |= swap32 (salt_buf3[3]);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+
+ w[ 0] |= swap32_S (salt_buf0[0]);
+ w[ 1] |= swap32_S (salt_buf0[1]);
+ w[ 2] |= swap32_S (salt_buf0[2]);
+ w[ 3] |= swap32_S (salt_buf0[3]);
+ w[ 4] |= swap32_S (salt_buf1[0]);
+ w[ 5] |= swap32_S (salt_buf1[1]);
+ w[ 6] |= swap32_S (salt_buf1[2]);
+ w[ 7] |= swap32_S (salt_buf1[3]);
+ w[ 8] |= swap32_S (salt_buf2[0]);
+ w[ 9] |= swap32_S (salt_buf2[1]);
+ w[10] |= swap32_S (salt_buf2[2]);
+ w[11] |= swap32_S (salt_buf2[3]);
+ w[12] |= swap32_S (salt_buf3[0]);
+ w[13] |= swap32_S (salt_buf3[1]);
+ w[14] |= swap32_S (salt_buf3[2]);
+ w[15] |= swap32_S (salt_buf3[3]);
const u32 salt_len = salt_bufs[salt_pos].salt_len;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = w0;
w0_t[1] = w[ 1];
w3_t[2] = w[14];
w3_t[3] = w[15];
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
-static void m01710s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01710s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = w0;
w0_t[1] = w[ 1];
w3_t[2] = w[14];
w3_t[3] = w[15];
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
-__kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01710m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01710_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01710_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01710m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01710_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01710_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01710m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01710s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01710_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01710_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01710s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01710_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01710_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
const u32 out_salt_len = out_len + salt_len;
- switch_buffer_by_offset (w0, w1, w2, w3, salt_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
const u32 out_salt_len = out_len + salt_len;
- switch_buffer_by_offset (w0, w1, w2, w3, salt_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
const u32 pw_salt_len = pw_len + salt_len;
- switch_buffer_by_offset (w0, w1, w2, w3, salt_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
const u32 pw_salt_len = pw_len + salt_len;
- switch_buffer_by_offset (w0, w1, w2, w3, salt_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
#define _SHA512_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 k_sha512[80] =
{
SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f,
};
-static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8])
+static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8])
{
- u64 w0_t = hl32_to_64 (w0[0], w0[1]);
- u64 w1_t = hl32_to_64 (w0[2], w0[3]);
- u64 w2_t = hl32_to_64 (w1[0], w1[1]);
- u64 w3_t = hl32_to_64 (w1[2], w1[3]);
- u64 w4_t = hl32_to_64 (w2[0], w2[1]);
- u64 w5_t = hl32_to_64 (w2[2], w2[3]);
- u64 w6_t = hl32_to_64 (w3[0], w3[1]);
- u64 w7_t = 0;
- u64 w8_t = 0;
- u64 w9_t = 0;
- u64 wa_t = 0;
- u64 wb_t = 0;
- u64 wc_t = 0;
- u64 wd_t = 0;
- u64 we_t = 0;
- u64 wf_t = hl32_to_64 (w3[2], w3[3]);
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ u64x w0_t = hl32_to_64 (w0[0], w0[1]);
+ u64x w1_t = hl32_to_64 (w0[2], w0[3]);
+ u64x w2_t = hl32_to_64 (w1[0], w1[1]);
+ u64x w3_t = hl32_to_64 (w1[2], w1[3]);
+ u64x w4_t = hl32_to_64 (w2[0], w2[1]);
+ u64x w5_t = hl32_to_64 (w2[2], w2[3]);
+ u64x w6_t = hl32_to_64 (w3[0], w3[1]);
+ u64x w7_t = 0;
+ u64x w8_t = 0;
+ u64x w9_t = 0;
+ u64x wa_t = 0;
+ u64x wb_t = 0;
+ u64x wc_t = 0;
+ u64x wd_t = 0;
+ u64x we_t = 0;
+ u64x wf_t = hl32_to_64 (w3[2], w3[3]);
+
+ u64x a = digest[0];
+ u64x b = digest[1];
+ u64x c = digest[2];
+ u64x d = digest[3];
+ u64x e = digest[4];
+ u64x f = digest[5];
+ u64x g = digest[6];
+ u64x h = digest[7];
#define ROUND_EXPAND() \
{ \
const u32 pw_salt_len = pw_len + salt_len;
+
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- /**
- * prepend salt
- */
-
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
-
- w0_t[0] = swap32 (w0[0]);
- w0_t[1] = swap32 (w0[1]);
- w0_t[2] = swap32 (w0[2]);
- w0_t[3] = swap32 (w0[3]);
- w1_t[0] = swap32 (w1[0]);
- w1_t[1] = swap32 (w1[1]);
- w1_t[2] = swap32 (w1[2]);
- w1_t[3] = swap32 (w1[3]);
- w2_t[0] = swap32 (w2[0]);
- w2_t[1] = swap32 (w2[1]);
- w2_t[2] = swap32 (w2[2]);
- w2_t[3] = swap32 (w2[3]);
- w3_t[0] = swap32 (w3[0]);
- w3_t[1] = swap32 (w3[1]);
- w3_t[2] = swap32 (w3[2]);
- w3_t[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
-
- w0_t[0] |= salt_buf0[0];
- w0_t[1] |= salt_buf0[1];
- w0_t[2] |= salt_buf0[2];
- w0_t[3] |= salt_buf0[3];
- w1_t[0] |= salt_buf1[0];
- w1_t[1] |= salt_buf1[1];
- w1_t[2] |= salt_buf1[2];
- w1_t[3] |= salt_buf1[3];
- w2_t[0] |= salt_buf2[0];
- w2_t[1] |= salt_buf2[1];
- w2_t[2] |= salt_buf2[2];
- w2_t[3] |= salt_buf2[3];
- w3_t[0] |= salt_buf3[0];
- w3_t[1] |= salt_buf3[1];
- w3_t[2] = 0;
- w3_t[3] = pw_salt_len * 8;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_be (wx, w0lr, salt_len);
+
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = wx[ 0];
+ w0_t[1] = wx[ 1];
+ w0_t[2] = wx[ 2];
+ w0_t[3] = wx[ 3];
+ w1_t[0] = wx[ 4];
+ w1_t[1] = wx[ 5];
+ w1_t[2] = wx[ 6];
+ w1_t[3] = wx[ 7];
+ w2_t[0] = wx[ 8];
+ w2_t[1] = wx[ 9];
+ w2_t[2] = wx[10];
+ w2_t[3] = wx[11];
+ w3_t[0] = wx[12];
+ w3_t[1] = wx[13];
+ w3_t[2] = 0;
+ w3_t[3] = pw_salt_len * 8;
/**
* sha512
*/
- w0_t[0] = swap32 (w0_t[0]);
- w0_t[1] = swap32 (w0_t[1]);
- w0_t[2] = swap32 (w0_t[2]);
- w0_t[3] = swap32 (w0_t[3]);
- w1_t[0] = swap32 (w1_t[0]);
- w1_t[1] = swap32 (w1_t[1]);
- w1_t[2] = swap32 (w1_t[2]);
- w1_t[3] = swap32 (w1_t[3]);
- w2_t[0] = swap32 (w2_t[0]);
- w2_t[1] = swap32 (w2_t[1]);
- w2_t[2] = swap32 (w2_t[2]);
- w2_t[3] = swap32 (w2_t[3]);
- w3_t[0] = swap32 (w3_t[0]);
- w3_t[1] = swap32 (w3_t[1]);
- //w3_t[2] = swap32 (w3_t[2]);
- //w3_t[3] = swap32 (w3_t[3]);
-
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* prepend salt
*/
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
-
- w0_t[0] = swap32 (w0[0]);
- w0_t[1] = swap32 (w0[1]);
- w0_t[2] = swap32 (w0[2]);
- w0_t[3] = swap32 (w0[3]);
- w1_t[0] = swap32 (w1[0]);
- w1_t[1] = swap32 (w1[1]);
- w1_t[2] = swap32 (w1[2]);
- w1_t[3] = swap32 (w1[3]);
- w2_t[0] = swap32 (w2[0]);
- w2_t[1] = swap32 (w2[1]);
- w2_t[2] = swap32 (w2[2]);
- w2_t[3] = swap32 (w2[3]);
- w3_t[0] = swap32 (w3[0]);
- w3_t[1] = swap32 (w3[1]);
- w3_t[2] = swap32 (w3[2]);
- w3_t[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
-
- w0_t[0] |= salt_buf0[0];
- w0_t[1] |= salt_buf0[1];
- w0_t[2] |= salt_buf0[2];
- w0_t[3] |= salt_buf0[3];
- w1_t[0] |= salt_buf1[0];
- w1_t[1] |= salt_buf1[1];
- w1_t[2] |= salt_buf1[2];
- w1_t[3] |= salt_buf1[3];
- w2_t[0] |= salt_buf2[0];
- w2_t[1] |= salt_buf2[1];
- w2_t[2] |= salt_buf2[2];
- w2_t[3] |= salt_buf2[3];
- w3_t[0] |= salt_buf3[0];
- w3_t[1] |= salt_buf3[1];
- w3_t[2] = 0;
- w3_t[3] = pw_salt_len * 8;
-
- /**
- * sha512
- */
-
- w0_t[0] = swap32 (w0_t[0]);
- w0_t[1] = swap32 (w0_t[1]);
- w0_t[2] = swap32 (w0_t[2]);
- w0_t[3] = swap32 (w0_t[3]);
- w1_t[0] = swap32 (w1_t[0]);
- w1_t[1] = swap32 (w1_t[1]);
- w1_t[2] = swap32 (w1_t[2]);
- w1_t[3] = swap32 (w1_t[3]);
- w2_t[0] = swap32 (w2_t[0]);
- w2_t[1] = swap32 (w2_t[1]);
- w2_t[2] = swap32 (w2_t[2]);
- w2_t[3] = swap32 (w2_t[3]);
- w3_t[0] = swap32 (w3_t[0]);
- w3_t[1] = swap32 (w3_t[1]);
- //w3_t[2] = swap32 (w3_t[2]);
- //w3_t[3] = swap32 (w3_t[3]);
-
- u64 digest[8];
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_be (wx, w0lr, salt_len);
+
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = wx[ 0];
+ w0_t[1] = wx[ 1];
+ w0_t[2] = wx[ 2];
+ w0_t[3] = wx[ 3];
+ w1_t[0] = wx[ 4];
+ w1_t[1] = wx[ 5];
+ w1_t[2] = wx[ 6];
+ w1_t[3] = wx[ 7];
+ w2_t[0] = wx[ 8];
+ w2_t[1] = wx[ 9];
+ w2_t[2] = wx[10];
+ w2_t[3] = wx[11];
+ w3_t[0] = wx[12];
+ w3_t[1] = wx[13];
+ w3_t[2] = 0;
+ w3_t[3] = pw_salt_len * 8;
+
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2));
const u32 out_salt_len = (out_len * 2) + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2));
const u32 out_salt_len = (out_len * 2) + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2));
const u32 pw_salt_len = (pw_len * 2) + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2));
+ switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2));
const u32 pw_salt_len = (pw_len * 2) + salt_len;
#define _SHA512_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 k_sha512[80] =
{
SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f,
};
-static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8])
+static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8])
{
- u64 w0_t = hl32_to_64 (w0[0], w0[1]);
- u64 w1_t = hl32_to_64 (w0[2], w0[3]);
- u64 w2_t = hl32_to_64 (w1[0], w1[1]);
- u64 w3_t = hl32_to_64 (w1[2], w1[3]);
- u64 w4_t = hl32_to_64 (w2[0], w2[1]);
- u64 w5_t = hl32_to_64 (w2[2], w2[3]);
- u64 w6_t = hl32_to_64 (w3[0], w3[1]);
- u64 w7_t = 0;
- u64 w8_t = 0;
- u64 w9_t = 0;
- u64 wa_t = 0;
- u64 wb_t = 0;
- u64 wc_t = 0;
- u64 wd_t = 0;
- u64 we_t = 0;
- u64 wf_t = hl32_to_64 (w3[2], w3[3]);
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ u64x w0_t = hl32_to_64 (w0[0], w0[1]);
+ u64x w1_t = hl32_to_64 (w0[2], w0[3]);
+ u64x w2_t = hl32_to_64 (w1[0], w1[1]);
+ u64x w3_t = hl32_to_64 (w1[2], w1[3]);
+ u64x w4_t = hl32_to_64 (w2[0], w2[1]);
+ u64x w5_t = hl32_to_64 (w2[2], w2[3]);
+ u64x w6_t = hl32_to_64 (w3[0], w3[1]);
+ u64x w7_t = 0;
+ u64x w8_t = 0;
+ u64x w9_t = 0;
+ u64x wa_t = 0;
+ u64x wb_t = 0;
+ u64x wc_t = 0;
+ u64x wd_t = 0;
+ u64x we_t = 0;
+ u64x wf_t = hl32_to_64 (w3[2], w3[3]);
+
+ u64x a = digest[0];
+ u64x b = digest[1];
+ u64x c = digest[2];
+ u64x d = digest[3];
+ u64x e = digest[4];
+ u64x f = digest[5];
+ u64x g = digest[6];
+ u64x h = digest[7];
#define ROUND_EXPAND() \
{ \
digest[7] = h;
}
-static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
-
- w[ 0] |= swap32 (salt_buf0[0]);
- w[ 1] |= swap32 (salt_buf0[1]);
- w[ 2] |= swap32 (salt_buf0[2]);
- w[ 3] |= swap32 (salt_buf0[3]);
- w[ 4] |= swap32 (salt_buf1[0]);
- w[ 5] |= swap32 (salt_buf1[1]);
- w[ 6] |= swap32 (salt_buf1[2]);
- w[ 7] |= swap32 (salt_buf1[3]);
- w[ 8] |= swap32 (salt_buf2[0]);
- w[ 9] |= swap32 (salt_buf2[1]);
- w[10] |= swap32 (salt_buf2[2]);
- w[11] |= swap32 (salt_buf2[3]);
- w[12] |= swap32 (salt_buf3[0]);
- w[13] |= swap32 (salt_buf3[1]);
- w[14] |= swap32 (salt_buf3[2]);
- w[15] |= swap32 (salt_buf3[3]);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+
+ w[ 0] |= swap32_S (salt_buf0[0]);
+ w[ 1] |= swap32_S (salt_buf0[1]);
+ w[ 2] |= swap32_S (salt_buf0[2]);
+ w[ 3] |= swap32_S (salt_buf0[3]);
+ w[ 4] |= swap32_S (salt_buf1[0]);
+ w[ 5] |= swap32_S (salt_buf1[1]);
+ w[ 6] |= swap32_S (salt_buf1[2]);
+ w[ 7] |= swap32_S (salt_buf1[3]);
+ w[ 8] |= swap32_S (salt_buf2[0]);
+ w[ 9] |= swap32_S (salt_buf2[1]);
+ w[10] |= swap32_S (salt_buf2[2]);
+ w[11] |= swap32_S (salt_buf2[3]);
+ w[12] |= swap32_S (salt_buf3[0]);
+ w[13] |= swap32_S (salt_buf3[1]);
+ w[14] |= swap32_S (salt_buf3[2]);
+ w[15] |= swap32_S (salt_buf3[3]);
const u32 salt_len = salt_bufs[salt_pos].salt_len;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
-
- const u32 w0 = w0l | w0r;
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+ const u32x w0 = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = w0;
w0_t[1] = w[ 1];
w3_t[2] = w[14];
w3_t[3] = w[15];
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
-static void m01730s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m01730s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = w0;
w0_t[1] = w[ 1];
w3_t[2] = w[14];
w3_t[3] = w[15];
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
-__kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01730m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01730_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01730_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01730m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01730_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01730_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01730m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01730s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01730_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01730_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m01730s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m01730_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m01730_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
#define _SHA512_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 k_sha512[80] =
{
SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f,
};
-static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8])
+static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8])
{
- u64 w0_t = hl32_to_64 (w0[0], w0[1]);
- u64 w1_t = hl32_to_64 (w0[2], w0[3]);
- u64 w2_t = hl32_to_64 (w1[0], w1[1]);
- u64 w3_t = hl32_to_64 (w1[2], w1[3]);
- u64 w4_t = hl32_to_64 (w2[0], w2[1]);
- u64 w5_t = hl32_to_64 (w2[2], w2[3]);
- u64 w6_t = hl32_to_64 (w3[0], w3[1]);
- u64 w7_t = 0;
- u64 w8_t = 0;
- u64 w9_t = 0;
- u64 wa_t = 0;
- u64 wb_t = 0;
- u64 wc_t = 0;
- u64 wd_t = 0;
- u64 we_t = 0;
- u64 wf_t = hl32_to_64 (w3[2], w3[3]);
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ u64x w0_t = hl32_to_64 (w0[0], w0[1]);
+ u64x w1_t = hl32_to_64 (w0[2], w0[3]);
+ u64x w2_t = hl32_to_64 (w1[0], w1[1]);
+ u64x w3_t = hl32_to_64 (w1[2], w1[3]);
+ u64x w4_t = hl32_to_64 (w2[0], w2[1]);
+ u64x w5_t = hl32_to_64 (w2[2], w2[3]);
+ u64x w6_t = hl32_to_64 (w3[0], w3[1]);
+ u64x w7_t = 0;
+ u64x w8_t = 0;
+ u64x w9_t = 0;
+ u64x wa_t = 0;
+ u64x wb_t = 0;
+ u64x wc_t = 0;
+ u64x wd_t = 0;
+ u64x we_t = 0;
+ u64x wf_t = hl32_to_64 (w3[2], w3[3]);
+
+ u64x a = digest[0];
+ u64x b = digest[1];
+ u64x c = digest[2];
+ u64x d = digest[3];
+ u64x e = digest[4];
+ u64x f = digest[5];
+ u64x g = digest[6];
+ u64x h = digest[7];
#define ROUND_EXPAND() \
{ \
const u32 pw_salt_len = pw_len + salt_len;
+
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- /**
- * prepend salt
- */
-
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
-
- w0_t[0] = swap32 (w0[0]);
- w0_t[1] = swap32 (w0[1]);
- w0_t[2] = swap32 (w0[2]);
- w0_t[3] = swap32 (w0[3]);
- w1_t[0] = swap32 (w1[0]);
- w1_t[1] = swap32 (w1[1]);
- w1_t[2] = swap32 (w1[2]);
- w1_t[3] = swap32 (w1[3]);
- w2_t[0] = swap32 (w2[0]);
- w2_t[1] = swap32 (w2[1]);
- w2_t[2] = swap32 (w2[2]);
- w2_t[3] = swap32 (w2[3]);
- w3_t[0] = swap32 (w3[0]);
- w3_t[1] = swap32 (w3[1]);
- w3_t[2] = swap32 (w3[2]);
- w3_t[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
-
- w0_t[0] |= salt_buf0[0];
- w0_t[1] |= salt_buf0[1];
- w0_t[2] |= salt_buf0[2];
- w0_t[3] |= salt_buf0[3];
- w1_t[0] |= salt_buf1[0];
- w1_t[1] |= salt_buf1[1];
- w1_t[2] |= salt_buf1[2];
- w1_t[3] |= salt_buf1[3];
- w2_t[0] |= salt_buf2[0];
- w2_t[1] |= salt_buf2[1];
- w2_t[2] |= salt_buf2[2];
- w2_t[3] |= salt_buf2[3];
- w3_t[0] |= salt_buf3[0];
- w3_t[1] |= salt_buf3[1];
- w3_t[2] = 0;
- w3_t[3] = pw_salt_len * 8;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_be (wx, w0lr, salt_len);
+
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = wx[ 0];
+ w0_t[1] = wx[ 1];
+ w0_t[2] = wx[ 2];
+ w0_t[3] = wx[ 3];
+ w1_t[0] = wx[ 4];
+ w1_t[1] = wx[ 5];
+ w1_t[2] = wx[ 6];
+ w1_t[3] = wx[ 7];
+ w2_t[0] = wx[ 8];
+ w2_t[1] = wx[ 9];
+ w2_t[2] = wx[10];
+ w2_t[3] = wx[11];
+ w3_t[0] = wx[12];
+ w3_t[1] = wx[13];
+ w3_t[2] = 0;
+ w3_t[3] = pw_salt_len * 8;
/**
* sha512
*/
- w0_t[0] = swap32 (w0_t[0]);
- w0_t[1] = swap32 (w0_t[1]);
- w0_t[2] = swap32 (w0_t[2]);
- w0_t[3] = swap32 (w0_t[3]);
- w1_t[0] = swap32 (w1_t[0]);
- w1_t[1] = swap32 (w1_t[1]);
- w1_t[2] = swap32 (w1_t[2]);
- w1_t[3] = swap32 (w1_t[3]);
- w2_t[0] = swap32 (w2_t[0]);
- w2_t[1] = swap32 (w2_t[1]);
- w2_t[2] = swap32 (w2_t[2]);
- w2_t[3] = swap32 (w2_t[3]);
- w3_t[0] = swap32 (w3_t[0]);
- w3_t[1] = swap32 (w3_t[1]);
- //w3_t[2] = swap32 (w3_t[2]);
- //w3_t[3] = swap32 (w3_t[3]);
-
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
const u32 pw_salt_len = pw_len + salt_len;
+ /**
+ * prepend salt
+ */
+
+ u32 w0_t[4];
+ u32 w1_t[4];
+ u32 w2_t[4];
+ u32 w3_t[4];
+
+ w0_t[0] = swap32_S (w0[0]);
+ w0_t[1] = swap32_S (w0[1]);
+ w0_t[2] = swap32_S (w0[2]);
+ w0_t[3] = swap32_S (w0[3]);
+ w1_t[0] = swap32_S (w1[0]);
+ w1_t[1] = swap32_S (w1[1]);
+ w1_t[2] = swap32_S (w1[2]);
+ w1_t[3] = swap32_S (w1[3]);
+ w2_t[0] = swap32_S (w2[0]);
+ w2_t[1] = swap32_S (w2[1]);
+ w2_t[2] = swap32_S (w2[2]);
+ w2_t[3] = swap32_S (w2[3]);
+ w3_t[0] = swap32_S (w3[0]);
+ w3_t[1] = swap32_S (w3[1]);
+ w3_t[2] = swap32_S (w3[2]);
+ w3_t[3] = swap32_S (w3[3]);
+
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
+
+ w0_t[0] |= salt_buf0[0];
+ w0_t[1] |= salt_buf0[1];
+ w0_t[2] |= salt_buf0[2];
+ w0_t[3] |= salt_buf0[3];
+ w1_t[0] |= salt_buf1[0];
+ w1_t[1] |= salt_buf1[1];
+ w1_t[2] |= salt_buf1[2];
+ w1_t[3] |= salt_buf1[3];
+ w2_t[0] |= salt_buf2[0];
+ w2_t[1] |= salt_buf2[1];
+ w2_t[2] |= salt_buf2[2];
+ w2_t[3] |= salt_buf2[3];
+ w3_t[0] |= salt_buf3[0];
+ w3_t[1] |= salt_buf3[1];
+ w3_t[2] |= salt_buf3[2];
+ w3_t[3] |= salt_buf3[3];
+
+ w0_t[0] = swap32_S (w0_t[0]);
+ w0_t[1] = swap32_S (w0_t[1]);
+ w0_t[2] = swap32_S (w0_t[2]);
+ w0_t[3] = swap32_S (w0_t[3]);
+ w1_t[0] = swap32_S (w1_t[0]);
+ w1_t[1] = swap32_S (w1_t[1]);
+ w1_t[2] = swap32_S (w1_t[2]);
+ w1_t[3] = swap32_S (w1_t[3]);
+ w2_t[0] = swap32_S (w2_t[0]);
+ w2_t[1] = swap32_S (w2_t[1]);
+ w2_t[2] = swap32_S (w2_t[2]);
+ w2_t[3] = swap32_S (w2_t[3]);
+ w3_t[0] = swap32_S (w3_t[0]);
+ w3_t[1] = swap32_S (w3_t[1]);
+ w3_t[2] = swap32_S (w3_t[2]);
+ w3_t[3] = swap32_S (w3_t[3]);
+
/**
* loop
*/
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* prepend salt
*/
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
-
- w0_t[0] = swap32 (w0[0]);
- w0_t[1] = swap32 (w0[1]);
- w0_t[2] = swap32 (w0[2]);
- w0_t[3] = swap32 (w0[3]);
- w1_t[0] = swap32 (w1[0]);
- w1_t[1] = swap32 (w1[1]);
- w1_t[2] = swap32 (w1[2]);
- w1_t[3] = swap32 (w1[3]);
- w2_t[0] = swap32 (w2[0]);
- w2_t[1] = swap32 (w2[1]);
- w2_t[2] = swap32 (w2[2]);
- w2_t[3] = swap32 (w2[3]);
- w3_t[0] = swap32 (w3[0]);
- w3_t[1] = swap32 (w3[1]);
- w3_t[2] = swap32 (w3[2]);
- w3_t[3] = swap32 (w3[3]);
-
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
-
- w0_t[0] |= salt_buf0[0];
- w0_t[1] |= salt_buf0[1];
- w0_t[2] |= salt_buf0[2];
- w0_t[3] |= salt_buf0[3];
- w1_t[0] |= salt_buf1[0];
- w1_t[1] |= salt_buf1[1];
- w1_t[2] |= salt_buf1[2];
- w1_t[3] |= salt_buf1[3];
- w2_t[0] |= salt_buf2[0];
- w2_t[1] |= salt_buf2[1];
- w2_t[2] |= salt_buf2[2];
- w2_t[3] |= salt_buf2[3];
- w3_t[0] |= salt_buf3[0];
- w3_t[1] |= salt_buf3[1];
- w3_t[2] = 0;
- w3_t[3] = pw_salt_len * 8;
-
- /**
- * sha512
- */
-
- w0_t[0] = swap32 (w0_t[0]);
- w0_t[1] = swap32 (w0_t[1]);
- w0_t[2] = swap32 (w0_t[2]);
- w0_t[3] = swap32 (w0_t[3]);
- w1_t[0] = swap32 (w1_t[0]);
- w1_t[1] = swap32 (w1_t[1]);
- w1_t[2] = swap32 (w1_t[2]);
- w1_t[3] = swap32 (w1_t[3]);
- w2_t[0] = swap32 (w2_t[0]);
- w2_t[1] = swap32 (w2_t[1]);
- w2_t[2] = swap32 (w2_t[2]);
- w2_t[3] = swap32 (w2_t[3]);
- w3_t[0] = swap32 (w3_t[0]);
- w3_t[1] = swap32 (w3_t[1]);
- //w3_t[2] = swap32 (w3_t[2]);
- //w3_t[3] = swap32 (w3_t[3]);
-
- u64 digest[8];
+ u32x wx[16];
+
+ wx[ 0] = w0_t[0];
+ wx[ 1] = w0_t[1];
+ wx[ 2] = w0_t[2];
+ wx[ 3] = w0_t[3];
+ wx[ 4] = w1_t[0];
+ wx[ 5] = w1_t[1];
+ wx[ 6] = w1_t[2];
+ wx[ 7] = w1_t[3];
+ wx[ 8] = w2_t[0];
+ wx[ 9] = w2_t[1];
+ wx[10] = w2_t[2];
+ wx[11] = w2_t[3];
+ wx[12] = w3_t[0];
+ wx[13] = w3_t[1];
+ wx[14] = w3_t[2];
+ wx[15] = w3_t[3];
+
+ overwrite_at_be (wx, w0lr, salt_len);
+
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = wx[ 0];
+ w0_t[1] = wx[ 1];
+ w0_t[2] = wx[ 2];
+ w0_t[3] = wx[ 3];
+ w1_t[0] = wx[ 4];
+ w1_t[1] = wx[ 5];
+ w1_t[2] = wx[ 6];
+ w1_t[3] = wx[ 7];
+ w2_t[0] = wx[ 8];
+ w2_t[1] = wx[ 9];
+ w2_t[2] = wx[10];
+ w2_t[3] = wx[11];
+ w3_t[0] = wx[12];
+ w3_t[1] = wx[13];
+ w3_t[2] = 0;
+ w3_t[3] = pw_salt_len * 8;
+
+ u64x digest[8];
digest[0] = SHA512M_A;
digest[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA512_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 k_sha512[80] =
{
SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f,
};
-static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], const u64 w3[4], u64 digest[8])
+static void sha512_transform (const u64x w0[4], const u64x w1[4], const u64x w2[4], const u64x w3[4], u64x digest[8])
{
- u64 w0_t = w0[0];
- u64 w1_t = w0[1];
- u64 w2_t = w0[2];
- u64 w3_t = w0[3];
- u64 w4_t = w1[0];
- u64 w5_t = w1[1];
- u64 w6_t = w1[2];
- u64 w7_t = w1[3];
- u64 w8_t = w2[0];
- u64 w9_t = w2[1];
- u64 wa_t = w2[2];
- u64 wb_t = w2[3];
- u64 wc_t = w3[0];
- u64 wd_t = w3[1];
- u64 we_t = w3[2];
- u64 wf_t = w3[3];
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ u64x w0_t = w0[0];
+ u64x w1_t = w0[1];
+ u64x w2_t = w0[2];
+ u64x w3_t = w0[3];
+ u64x w4_t = w1[0];
+ u64x w5_t = w1[1];
+ u64x w6_t = w1[2];
+ u64x w7_t = w1[3];
+ u64x w8_t = w2[0];
+ u64x w9_t = w2[1];
+ u64x wa_t = w2[2];
+ u64x wb_t = w2[3];
+ u64x wc_t = w3[0];
+ u64x wd_t = w3[1];
+ u64x we_t = w3[2];
+ u64x wf_t = w3[3];
+
+ u64x a = digest[0];
+ u64x b = digest[1];
+ u64x c = digest[2];
+ u64x d = digest[3];
+ u64x e = digest[4];
+ u64x f = digest[5];
+ u64x g = digest[6];
+ u64x h = digest[7];
#define ROUND_EXPAND() \
{ \
digest[7] += h;
}
-static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipad[8], u64 opad[8])
+static void hmac_sha512_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u64x ipad[8], u64x opad[8])
{
- u64 w0_t[4];
- u64 w1_t[4];
- u64 w2_t[4];
- u64 w3_t[4];
+ u64x w0_t[4];
+ u64x w1_t[4];
+ u64x w2_t[4];
+ u64x w3_t[4];
w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ 0x3636363636363636;
w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ 0x3636363636363636;
w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x3636363636363636;
w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x3636363636363636;
w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x3636363636363636;
- w2_t[0] = 0x3636363636363636;
- w2_t[1] = 0x3636363636363636;
- w2_t[2] = 0x3636363636363636;
- w2_t[3] = 0x3636363636363636;
- w3_t[0] = 0x3636363636363636;
- w3_t[1] = 0x3636363636363636;
- w3_t[2] = 0x3636363636363636;
- w3_t[3] = 0x3636363636363636;
+ w2_t[0] = 0x3636363636363636;
+ w2_t[1] = 0x3636363636363636;
+ w2_t[2] = 0x3636363636363636;
+ w2_t[3] = 0x3636363636363636;
+ w3_t[0] = 0x3636363636363636;
+ w3_t[1] = 0x3636363636363636;
+ w3_t[2] = 0x3636363636363636;
+ w3_t[3] = 0x3636363636363636;
ipad[0] = SHA512M_A;
ipad[1] = SHA512M_B;
w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x5c5c5c5c5c5c5c5c;
w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x5c5c5c5c5c5c5c5c;
w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x5c5c5c5c5c5c5c5c;
- w2_t[0] = 0x5c5c5c5c5c5c5c5c;
- w2_t[1] = 0x5c5c5c5c5c5c5c5c;
- w2_t[2] = 0x5c5c5c5c5c5c5c5c;
- w2_t[3] = 0x5c5c5c5c5c5c5c5c;
- w3_t[0] = 0x5c5c5c5c5c5c5c5c;
- w3_t[1] = 0x5c5c5c5c5c5c5c5c;
- w3_t[2] = 0x5c5c5c5c5c5c5c5c;
- w3_t[3] = 0x5c5c5c5c5c5c5c5c;
+ w2_t[0] = 0x5c5c5c5c5c5c5c5c;
+ w2_t[1] = 0x5c5c5c5c5c5c5c5c;
+ w2_t[2] = 0x5c5c5c5c5c5c5c5c;
+ w2_t[3] = 0x5c5c5c5c5c5c5c5c;
+ w3_t[0] = 0x5c5c5c5c5c5c5c5c;
+ w3_t[1] = 0x5c5c5c5c5c5c5c5c;
+ w3_t[2] = 0x5c5c5c5c5c5c5c5c;
+ w3_t[3] = 0x5c5c5c5c5c5c5c5c;
opad[0] = SHA512M_A;
opad[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, opad);
}
-static void hmac_sha512_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipad[8], u64 opad[8], u64 digest[8])
+static void hmac_sha512_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u64x ipad[8], u64x opad[8], u64x digest[8])
{
- u64 w0_t[4];
- u64 w1_t[4];
- u64 w2_t[4];
- u64 w3_t[4];
+ u64x w0_t[4];
+ u64x w1_t[4];
+ u64x w2_t[4];
+ u64x w3_t[4];
w0_t[0] = hl32_to_64 (w0[0], w0[1]);
w0_t[1] = hl32_to_64 (w0[2], w0[3]);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u64 ipad[8];
- u64 opad[8];
+ u64x ipad[8];
+ u64x opad[8];
hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (128 + salt_len) * 8;
- u64 digest[8];
+ u64x digest[8];
hmac_sha512_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u64 ipad[8];
- u64 opad[8];
+ u64x ipad[8];
+ u64x opad[8];
hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (128 + salt_len) * 8;
- u64 digest[8];
+ u64x digest[8];
hmac_sha512_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA512_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 k_sha512[80] =
{
SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f,
};
-static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], const u64 w3[4], u64 digest[8])
+static void sha512_transform (const u64x w0[4], const u64x w1[4], const u64x w2[4], const u64x w3[4], u64x digest[8])
{
- u64 w0_t = w0[0];
- u64 w1_t = w0[1];
- u64 w2_t = w0[2];
- u64 w3_t = w0[3];
- u64 w4_t = w1[0];
- u64 w5_t = w1[1];
- u64 w6_t = w1[2];
- u64 w7_t = w1[3];
- u64 w8_t = w2[0];
- u64 w9_t = w2[1];
- u64 wa_t = w2[2];
- u64 wb_t = w2[3];
- u64 wc_t = w3[0];
- u64 wd_t = w3[1];
- u64 we_t = w3[2];
- u64 wf_t = w3[3];
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ u64x w0_t = w0[0];
+ u64x w1_t = w0[1];
+ u64x w2_t = w0[2];
+ u64x w3_t = w0[3];
+ u64x w4_t = w1[0];
+ u64x w5_t = w1[1];
+ u64x w6_t = w1[2];
+ u64x w7_t = w1[3];
+ u64x w8_t = w2[0];
+ u64x w9_t = w2[1];
+ u64x wa_t = w2[2];
+ u64x wb_t = w2[3];
+ u64x wc_t = w3[0];
+ u64x wd_t = w3[1];
+ u64x we_t = w3[2];
+ u64x wf_t = w3[3];
+
+ u64x a = digest[0];
+ u64x b = digest[1];
+ u64x c = digest[2];
+ u64x d = digest[3];
+ u64x e = digest[4];
+ u64x f = digest[5];
+ u64x g = digest[6];
+ u64x h = digest[7];
#define ROUND_EXPAND() \
{ \
digest[7] += h;
}
-static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipad[8], u64 opad[8])
+static void hmac_sha512_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u64x ipad[8], u64x opad[8])
{
- u64 w0_t[4];
- u64 w1_t[4];
- u64 w2_t[4];
- u64 w3_t[4];
+ u64x w0_t[4];
+ u64x w1_t[4];
+ u64x w2_t[4];
+ u64x w3_t[4];
w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ 0x3636363636363636;
w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ 0x3636363636363636;
w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x3636363636363636;
w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x3636363636363636;
w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x3636363636363636;
- w2_t[0] = 0 ^ 0x3636363636363636;
- w2_t[1] = 0 ^ 0x3636363636363636;
- w2_t[2] = 0 ^ 0x3636363636363636;
- w2_t[3] = 0 ^ 0x3636363636363636;
- w3_t[0] = 0 ^ 0x3636363636363636;
- w3_t[1] = 0 ^ 0x3636363636363636;
- w3_t[2] = 0 ^ 0x3636363636363636;
- w3_t[3] = 0 ^ 0x3636363636363636;
+ w2_t[0] = 0x3636363636363636;
+ w2_t[1] = 0x3636363636363636;
+ w2_t[2] = 0x3636363636363636;
+ w2_t[3] = 0x3636363636363636;
+ w3_t[0] = 0x3636363636363636;
+ w3_t[1] = 0x3636363636363636;
+ w3_t[2] = 0x3636363636363636;
+ w3_t[3] = 0x3636363636363636;
ipad[0] = SHA512M_A;
ipad[1] = SHA512M_B;
w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x5c5c5c5c5c5c5c5c;
w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x5c5c5c5c5c5c5c5c;
w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x5c5c5c5c5c5c5c5c;
- w2_t[0] = 0 ^ 0x5c5c5c5c5c5c5c5c;
- w2_t[1] = 0 ^ 0x5c5c5c5c5c5c5c5c;
- w2_t[2] = 0 ^ 0x5c5c5c5c5c5c5c5c;
- w2_t[3] = 0 ^ 0x5c5c5c5c5c5c5c5c;
- w3_t[0] = 0 ^ 0x5c5c5c5c5c5c5c5c;
- w3_t[1] = 0 ^ 0x5c5c5c5c5c5c5c5c;
- w3_t[2] = 0 ^ 0x5c5c5c5c5c5c5c5c;
- w3_t[3] = 0 ^ 0x5c5c5c5c5c5c5c5c;
+ w2_t[0] = 0x5c5c5c5c5c5c5c5c;
+ w2_t[1] = 0x5c5c5c5c5c5c5c5c;
+ w2_t[2] = 0x5c5c5c5c5c5c5c5c;
+ w2_t[3] = 0x5c5c5c5c5c5c5c5c;
+ w3_t[0] = 0x5c5c5c5c5c5c5c5c;
+ w3_t[1] = 0x5c5c5c5c5c5c5c5c;
+ w3_t[2] = 0x5c5c5c5c5c5c5c5c;
+ w3_t[3] = 0x5c5c5c5c5c5c5c5c;
opad[0] = SHA512M_A;
opad[1] = SHA512M_B;
sha512_transform (w0_t, w1_t, w2_t, w3_t, opad);
}
-static void hmac_sha512_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipad[8], u64 opad[8], u64 digest[8])
+static void hmac_sha512_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u64x ipad[8], u64x opad[8], u64x digest[8])
{
- u64 w0_t[4];
- u64 w1_t[4];
- u64 w2_t[4];
- u64 w3_t[4];
+ u64x w0_t[4];
+ u64x w1_t[4];
+ u64x w2_t[4];
+ u64x w3_t[4];
w0_t[0] = hl32_to_64 (w0[0], w0[1]);
w0_t[1] = hl32_to_64 (w0[2], w0[3]);
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = swap32 (salt_buf0[0]);
w0_t[1] = swap32 (salt_buf0[1]);
w0_t[2] = swap32 (salt_buf0[2]);
w0_t[3] = swap32 (salt_buf0[3]);
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = swap32 (salt_buf1[0]);
w1_t[1] = swap32 (salt_buf1[1]);
w1_t[2] = swap32 (salt_buf1[2]);
w1_t[3] = swap32 (salt_buf1[3]);
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- u64 ipad[8];
- u64 opad[8];
+ u64x ipad[8];
+ u64x opad[8];
hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = (128 + pw_len) * 8;
- u64 digest[8];
+ u64x digest[8];
hmac_sha512_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = swap32 (salt_buf0[0]);
w0_t[1] = swap32 (salt_buf0[1]);
w0_t[2] = swap32 (salt_buf0[2]);
w0_t[3] = swap32 (salt_buf0[3]);
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = swap32 (salt_buf1[0]);
w1_t[1] = swap32 (salt_buf1[1]);
w1_t[2] = swap32 (salt_buf1[2]);
w1_t[3] = swap32 (salt_buf1[3]);
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- u64 ipad[8];
- u64 opad[8];
+ u64x ipad[8];
+ u64x opad[8];
hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = (128 + pw_len) * 8;
- u64 digest[8];
+ u64x digest[8];
hmac_sha512_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
+ const u32x r0 = l32_from_64 (digest[7]);
+ const u32x r1 = h32_from_64 (digest[7]);
+ const u32x r2 = l32_from_64 (digest[3]);
+ const u32x r3 = h32_from_64 (digest[3]);
- const u32 r0 = l32_from_64 (digest[7]);
- const u32 r1 = h32_from_64 (digest[7]);
- const u32 r2 = l32_from_64 (digest[3]);
- const u32 r3 = h32_from_64 (digest[3]);
-
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m02400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m02400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
c &= 0x00ffffff;
b &= 0x00ffffff;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
-static void m02400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m02400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
MD5_STEP0(MD5_I , b, c, d, a, I_wdc3b, MD5S33);
MD5_STEP0(MD5_I , a, b, c, d, I_w4c3c, MD5S30);
- bool q_cond = allx ((a & 0x00ffffff) != search[0]);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS ((a & 0x00ffffff), search[0])) continue;
MD5_STEP0(MD5_I , d, a, b, c, I_wbc3d, MD5S31);
MD5_STEP0(MD5_I , c, d, a, b, I_w2c3e, MD5S32);
c &= 0x00ffffff;
b &= 0x00ffffff;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
-__kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m02400m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m02400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m02400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m02400s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m02400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m02400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
w0[0] |= s0[0];
w0[1] |= s0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
w0[0] |= s0[0];
w0[1] |= s0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m02410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m02410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
w[0] |= salt_buf0[0];
w[1] |= salt_buf0[1];
const u32 pw_salt_len = pw_len + salt_len;
- truncate_block (w, pw_salt_len);
+ truncate_block_S (w, pw_salt_len);
/**
* algorithm specific
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
c &= 0x00ffffff;
b &= 0x00ffffff;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
-static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
salt_buf3[2] = 0;
salt_buf3[3] = 0;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
w[0] |= salt_buf0[0];
w[1] |= salt_buf0[1];
const u32 pw_salt_len = pw_len + salt_len;
- truncate_block (w, pw_salt_len);
+ truncate_block_S (w, pw_salt_len);
/**
* algorithm specific
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
MD5_STEP0(MD5_I , b, c, d, a, I_wdc3b, MD5S33);
MD5_STEP0(MD5_I , a, b, c, d, I_w4c3c, MD5S30);
- bool q_cond = allx ((a & 0x00ffffff) != search[0]);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS ((a & 0x00ffffff), search[0])) continue;
MD5_STEP0(MD5_I , d, a, b, c, I_wbc3d, MD5S31);
MD5_STEP0(MD5_I , c, d, a, b, I_w2c3e, MD5S32);
c &= 0x00ffffff;
b &= 0x00ffffff;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
-__kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m02410m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m02410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m02410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m02410s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m02410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m02410_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m02410_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m02610m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0
- | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
- const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0
- | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
- const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0
- | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
- const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0
- | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
- const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0
- | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
- const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0
- | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
- const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0
- | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
- const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0
- | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
-
- const u32 w8_t = s[0];
- const u32 w9_t = s[1];
- const u32 wa_t = s[2];
- const u32 wb_t = s[3];
- const u32 wc_t = s[4];
- const u32 wd_t = s[5];
- const u32 we_t = s[6];
- const u32 wf_t = s[7];
+ w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
+ w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
+ w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
+ w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
+ w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
+ w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
+ w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
+ w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
+
+ w2_t[0] = s[0];
+ w2_t[1] = s[1];
+ w2_t[2] = s[2];
+ w2_t[3] = s[3];
+
+ w3_t[0] = s[4];
+ w3_t[1] = s[5];
+ w3_t[2] = s[6];
+ w3_t[3] = s[7];
a = MD5M_A;
b = MD5M_B;
c = MD5M_C;
d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
-
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
+
+ COMPARE_M_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0
- | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
- const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0
- | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
- const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0
- | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
- const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0
- | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
- const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0
- | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
- const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0
- | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
- const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0
- | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
- const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0
- | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
-
- const u32 w8_t = s[0];
- const u32 w9_t = s[1];
- const u32 wa_t = s[2];
- const u32 wb_t = s[3];
- const u32 wc_t = s[4];
- const u32 wd_t = s[5];
- const u32 we_t = s[6];
- const u32 wf_t = s[7];
+ w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
+ w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
+ w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
+ w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
+ w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
+ w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
+ w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
+ w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
+
+ w2_t[0] = s[0];
+ w2_t[1] = s[1];
+ w2_t[2] = s[2];
+ w2_t[3] = s[3];
+
+ w3_t[0] = s[4];
+ w3_t[1] = s[5];
+ w3_t[2] = s[6];
+ w3_t[3] = s[7];
a = MD5M_A;
b = MD5M_B;
c = MD5M_C;
d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
-
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
+
+ COMPARE_S_SIMD (a, d, c, b);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m02710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0
- | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
- const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0
- | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
- const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0
- | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
- const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0
- | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
- const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0
- | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
- const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0
- | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
- const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0
- | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
- const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0
- | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
-
- const u32 w8_t = s[0];
- const u32 w9_t = s[1];
- const u32 wa_t = s[2];
- const u32 wb_t = s[3];
- const u32 wc_t = s[4];
- const u32 wd_t = s[5];
- const u32 we_t = s[6];
- const u32 wf_t = s[7];
+ w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
+ w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
+ w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
+ w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
+ w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
+ w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
+ w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
+ w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
+
+ w2_t[0] = s[0];
+ w2_t[1] = s[1];
+ w2_t[2] = s[2];
+ w2_t[3] = s[3];
+
+ w3_t[0] = s[4];
+ w3_t[1] = s[5];
+ w3_t[2] = s[6];
+ w3_t[3] = s[7];
a = MD5M_A;
b = MD5M_B;
c = MD5M_C;
d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
-
- const u32 r_a = a + MD5M_A;
- const u32 r_b = b + MD5M_B;
- const u32 r_c = c + MD5M_C;
- const u32 r_d = d + MD5M_D;
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
+
+ const u32x r_a = a + MD5M_A;
+ const u32x r_b = b + MD5M_B;
+ const u32x r_c = c + MD5M_C;
+ const u32x r_d = d + MD5M_D;
a = r_a;
b = r_b;
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0
- | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
- const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0
- | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
- const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0
- | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
- const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0
- | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
- const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0
- | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
- const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0
- | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
- const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0
- | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
- const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0
- | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
-
- const u32 w8_t = s[0];
- const u32 w9_t = s[1];
- const u32 wa_t = s[2];
- const u32 wb_t = s[3];
- const u32 wc_t = s[4];
- const u32 wd_t = s[5];
- const u32 we_t = s[6];
- const u32 wf_t = s[7];
+ w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
+ w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
+ w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
+ w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
+ w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
+ w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
+ w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
+ w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
+
+ w2_t[0] = s[0];
+ w2_t[1] = s[1];
+ w2_t[2] = s[2];
+ w2_t[3] = s[3];
+
+ w3_t[0] = s[4];
+ w3_t[1] = s[5];
+ w3_t[2] = s[6];
+ w3_t[3] = s[7];
a = MD5M_A;
b = MD5M_B;
c = MD5M_C;
d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
-
- const u32 r_a = a + MD5M_A;
- const u32 r_b = b + MD5M_B;
- const u32 r_c = c + MD5M_C;
- const u32 r_d = d + MD5M_D;
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
+
+ const u32x r_a = a + MD5M_A;
+ const u32x r_b = b + MD5M_B;
+ const u32x r_c = c + MD5M_C;
+ const u32x r_d = d + MD5M_D;
a = r_a;
b = r_b;
MD5_STEP0(MD5_I , b, c, d, a, MD5C3b, MD5S33);
MD5_STEP0(MD5_I , a, b, c, d, MD5C3c, MD5S30);
- if (allx ((a + r_a) != search[0])) continue;
+ if (MATCHES_NONE_VS ((a + r_a), search[0])) continue;
MD5_STEP0(MD5_I , d, a, b, c, MD5C3d, MD5S31);
MD5_STEP0(MD5_I , c, d, a, b, MD5C3e, MD5S32);
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m02810m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- const u32 w0_t = s[0];
- const u32 w1_t = s[1];
- const u32 w2_t = s[2];
- const u32 w3_t = s[3];
- const u32 w4_t = s[4];
- const u32 w5_t = s[5];
- const u32 w6_t = s[6];
- const u32 w7_t = s[7];
-
- const u32 w8_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0
- | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
- const u32 w9_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0
- | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
- const u32 wa_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0
- | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
- const u32 wb_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0
- | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
- const u32 wc_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0
- | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
- const u32 wd_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0
- | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
- const u32 we_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0
- | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
- const u32 wf_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0
- | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
+ w0_t[0] = s[0];
+ w0_t[1] = s[1];
+ w0_t[2] = s[2];
+ w0_t[3] = s[3];
+
+ w1_t[0] = s[4];
+ w1_t[1] = s[5];
+ w1_t[2] = s[6];
+ w1_t[3] = s[7];
+
+ w2_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
+ w2_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
+ w2_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
+ w2_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
+ w3_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
+ w3_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
+ w3_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
+ w3_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
a = MD5M_A;
b = MD5M_B;
c = MD5M_C;
d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
-
- const u32 r_a = a + MD5M_A;
- const u32 r_b = b + MD5M_B;
- const u32 r_c = c + MD5M_C;
- const u32 r_d = d + MD5M_D;
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
+
+ const u32x r_a = a + MD5M_A;
+ const u32x r_b = b + MD5M_B;
+ const u32x r_c = c + MD5M_C;
+ const u32x r_d = d + MD5M_D;
a = r_a;
b = r_b;
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- const u32 w0_t = s[0];
- const u32 w1_t = s[1];
- const u32 w2_t = s[2];
- const u32 w3_t = s[3];
- const u32 w4_t = s[4];
- const u32 w5_t = s[5];
- const u32 w6_t = s[6];
- const u32 w7_t = s[7];
-
- const u32 w8_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0
- | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
- const u32 w9_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0
- | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
- const u32 wa_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0
- | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
- const u32 wb_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0
- | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
- const u32 wc_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0
- | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
- const u32 wd_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0
- | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
- const u32 we_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0
- | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
- const u32 wf_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0
- | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
+ w0_t[0] = s[0];
+ w0_t[1] = s[1];
+ w0_t[2] = s[2];
+ w0_t[3] = s[3];
+
+ w1_t[0] = s[4];
+ w1_t[1] = s[5];
+ w1_t[2] = s[6];
+ w1_t[3] = s[7];
+
+ w2_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
+ w2_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
+ w2_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
+ w2_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
+ w3_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
+ w3_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
+ w3_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
+ w3_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0
+ | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
a = MD5M_A;
b = MD5M_B;
c = MD5M_C;
d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
-
- const u32 r_a = a + MD5M_A;
- const u32 r_b = b + MD5M_B;
- const u32 r_c = c + MD5M_C;
- const u32 r_d = d + MD5M_D;
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
+
+ const u32x r_a = a + MD5M_A;
+ const u32x r_b = b + MD5M_B;
+ const u32x r_c = c + MD5M_C;
+ const u32x r_d = d + MD5M_D;
a = r_a;
b = r_b;
MD5_STEP0(MD5_I , b, c, d, a, MD5C3b, MD5S33);
MD5_STEP0(MD5_I , a, b, c, d, MD5C3c, MD5S30);
- if (allx ((a + r_a) != search[0])) continue;
+ if (MATCHES_NONE_VS ((a + r_a), search[0])) continue;
MD5_STEP0(MD5_I , d, a, b, c, MD5C3d, MD5S31);
MD5_STEP0(MD5_I , c, d, a, b, MD5C3e, MD5S32);
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
-/**
+/**
* Author......: Jens Steube <jens.steube@gmail.com>
* License.....: MIT
*/
#define _DES_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
#define PERM_OP(a,b,tt,n,m) \
{ \
}
};
+#if VECT_SIZE == 1
#define BOX(i,n,S) (S)[(n)][(i)]
-
-static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 s_SPtrans[8][64])
+#elif VECT_SIZE == 2
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#elif VECT_SIZE == 4
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#elif VECT_SIZE == 8
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#endif
+
+static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 s_SPtrans[8][64])
{
- u32 tt;
+ u32x tt;
- u32 r = data[0];
- u32 l = data[1];
+ u32x r = data[0];
+ u32x l = data[1];
IP (r, l, tt);
#pragma unroll 16
for (u32 i = 0; i < 16; i += 2)
{
- u32 u;
- u32 t;
+ u32x u;
+ u32x t;
u = Kc[i + 0] ^ r;
t = Kd[i + 0] ^ rotl32 (r, 28u);
iv[1] = r;
}
-static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 s_skb[8][64])
+static void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 s_skb[8][64])
{
- u32 tt;
+ u32x tt;
PERM_OP (d, c, tt, 4, 0x0f0f0f0f);
HPERM_OP (c, tt, 2, 0xcccc0000);
c = c & 0x0fffffff;
d = d & 0x0fffffff;
- const u32 c00 = (c >> 0) & 0x0000003f;
- const u32 c06 = (c >> 6) & 0x00383003;
- const u32 c07 = (c >> 7) & 0x0000003c;
- const u32 c13 = (c >> 13) & 0x0000060f;
- const u32 c20 = (c >> 20) & 0x00000001;
-
- u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb)
- | BOX (((c06 >> 0) & 0xff)
- |((c07 >> 0) & 0xff), 1, s_skb)
- | BOX (((c13 >> 0) & 0xff)
- |((c06 >> 8) & 0xff), 2, s_skb)
- | BOX (((c20 >> 0) & 0xff)
- |((c13 >> 8) & 0xff)
- |((c06 >> 16) & 0xff), 3, s_skb);
-
- const u32 d00 = (d >> 0) & 0x00003c3f;
- const u32 d07 = (d >> 7) & 0x00003f03;
- const u32 d21 = (d >> 21) & 0x0000000f;
- const u32 d22 = (d >> 22) & 0x00000030;
-
- u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb)
- | BOX (((d07 >> 0) & 0xff)
- |((d00 >> 8) & 0xff), 5, s_skb)
- | BOX (((d07 >> 8) & 0xff), 6, s_skb)
- | BOX (((d21 >> 0) & 0xff)
- |((d22 >> 0) & 0xff), 7, s_skb);
+ const u32x c00 = (c >> 0) & 0x0000003f;
+ const u32x c06 = (c >> 6) & 0x00383003;
+ const u32x c07 = (c >> 7) & 0x0000003c;
+ const u32x c13 = (c >> 13) & 0x0000060f;
+ const u32x c20 = (c >> 20) & 0x00000001;
+
+ u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb)
+ | BOX (((c06 >> 0) & 0xff)
+ |((c07 >> 0) & 0xff), 1, s_skb)
+ | BOX (((c13 >> 0) & 0xff)
+ |((c06 >> 8) & 0xff), 2, s_skb)
+ | BOX (((c20 >> 0) & 0xff)
+ |((c13 >> 8) & 0xff)
+ |((c06 >> 16) & 0xff), 3, s_skb);
+
+ const u32x d00 = (d >> 0) & 0x00003c3f;
+ const u32x d07 = (d >> 7) & 0x00003f03;
+ const u32x d21 = (d >> 21) & 0x0000000f;
+ const u32x d22 = (d >> 22) & 0x00000030;
+
+ u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb)
+ | BOX (((d07 >> 0) & 0xff)
+ |((d00 >> 8) & 0xff), 5, s_skb)
+ | BOX (((d07 >> 8) & 0xff), 6, s_skb)
+ | BOX (((d21 >> 0) & 0xff)
+ |((d22 >> 0) & 0xff), 7, s_skb);
Kc[i] = ((t << 16) | (s & 0x0000ffff));
Kd[i] = ((s >> 16) | (t & 0xffff0000));
}
}
-static void overwrite_at (u32 sw[16], const u32 w0, const u32 salt_len)
-{
- #if defined cl_amd_media_ops
- switch (salt_len)
- {
- case 0: sw[0] = w0;
- break;
- case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3);
- sw[1] = amd_bytealign (sw[1] >> 8, w0, 3);
- break;
- case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2);
- sw[1] = amd_bytealign (sw[1] >> 16, w0, 2);
- break;
- case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1);
- sw[1] = amd_bytealign (sw[1] >> 24, w0, 1);
- break;
- case 4: sw[1] = w0;
- break;
- case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3);
- sw[2] = amd_bytealign (sw[2] >> 8, w0, 3);
- break;
- case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2);
- sw[2] = amd_bytealign (sw[2] >> 16, w0, 2);
- break;
- case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1);
- sw[2] = amd_bytealign (sw[2] >> 24, w0, 1);
- break;
- case 8: sw[2] = w0;
- break;
- case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3);
- sw[3] = amd_bytealign (sw[3] >> 8, w0, 3);
- break;
- case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2);
- sw[3] = amd_bytealign (sw[3] >> 16, w0, 2);
- break;
- case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1);
- sw[3] = amd_bytealign (sw[3] >> 24, w0, 1);
- break;
- case 12: sw[3] = w0;
- break;
- case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3);
- sw[4] = amd_bytealign (sw[4] >> 8, w0, 3);
- break;
- case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2);
- sw[4] = amd_bytealign (sw[4] >> 16, w0, 2);
- break;
- case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1);
- sw[4] = amd_bytealign (sw[4] >> 24, w0, 1);
- break;
- case 16: sw[4] = w0;
- break;
- case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3);
- sw[5] = amd_bytealign (sw[5] >> 8, w0, 3);
- break;
- case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2);
- sw[5] = amd_bytealign (sw[5] >> 16, w0, 2);
- break;
- case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1);
- sw[5] = amd_bytealign (sw[5] >> 24, w0, 1);
- break;
- case 20: sw[5] = w0;
- break;
- case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3);
- sw[6] = amd_bytealign (sw[6] >> 8, w0, 3);
- break;
- case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2);
- sw[6] = amd_bytealign (sw[6] >> 16, w0, 2);
- break;
- case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1);
- sw[6] = amd_bytealign (sw[6] >> 24, w0, 1);
- break;
- case 24: sw[6] = w0;
- break;
- case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3);
- sw[7] = amd_bytealign (sw[7] >> 8, w0, 3);
- break;
- case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2);
- sw[7] = amd_bytealign (sw[7] >> 16, w0, 2);
- break;
- case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1);
- sw[7] = amd_bytealign (sw[7] >> 24, w0, 1);
- break;
- case 28: sw[7] = w0;
- break;
- case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3);
- sw[8] = amd_bytealign (sw[8] >> 8, w0, 3);
- break;
- case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2);
- sw[8] = amd_bytealign (sw[8] >> 16, w0, 2);
- break;
- case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1);
- sw[8] = amd_bytealign (sw[8] >> 24, w0, 1);
- break;
- }
- #else
- switch (salt_len)
- {
- case 0: sw[0] = w0;
- break;
- case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8);
- sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
- break;
- case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16);
- sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
- break;
- case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24);
- sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
- break;
- case 4: sw[1] = w0;
- break;
- case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
- sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
- break;
- case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
- sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
- break;
- case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
- sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
- break;
- case 8: sw[2] = w0;
- break;
- case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
- sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
- break;
- case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
- sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
- break;
- case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
- sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
- break;
- case 12: sw[3] = w0;
- break;
- case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
- sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
- break;
- case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
- sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
- break;
- case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
- sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
- break;
- case 16: sw[4] = w0;
- break;
- case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
- sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
- break;
- case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
- sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
- break;
- case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
- sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
- break;
- case 20: sw[5] = w0;
- break;
- case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
- sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
- break;
- case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
- sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
- break;
- case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
- sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
- break;
- case 24: sw[6] = w0;
- break;
- case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
- sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
- break;
- case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
- sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
- break;
- case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
- sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
- break;
- case 28: sw[7] = w0;
- break;
- case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
- sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24);
- break;
- case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
- sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16);
- break;
- case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
- sw[8] = (sw[8] & 0xff000000) | (w0 >> 8);
- break;
- }
- #endif
-}
-
-static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
w3_t[2] = w[14];
w3_t[3] = w[15];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 dst[16];
+ u32x dst[16];
dst[ 0] = w0_t[0];
dst[ 1] = w0_t[1];
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- overwrite_at (dst, w0, salt_len);
+ overwrite_at_le (dst, w0, salt_len);
/**
* precompute key1 since key is static: 0x0123456789abcdef
* plus LEFT_ROTATE by 2
*/
- u32 Kc[16];
+ u32x Kc[16];
Kc[ 0] = 0x64649040;
Kc[ 1] = 0x14909858;
Kc[14] = 0x584020b4;
Kc[15] = 0x00742c4c;
- u32 Kd[16];
+ u32x Kd[16];
Kd[ 0] = 0xa42ce40c;
Kd[ 1] = 0x64689858;
* key1 (generate key)
*/
- u32 iv[2];
+ u32x iv[2];
iv[0] = 0;
iv[1] = 0;
for (u32 j = 0, k = 0; j < salt_word_len; j += 8, k++)
{
- u32 data[2];
+ u32x data[2];
data[0] = ((dst[k] << 16) & 0xff000000) | ((dst[k] << 8) & 0x0000ff00);
data[1] = ((dst[k] >> 0) & 0xff000000) | ((dst[k] >> 8) & 0x0000ff00);
for (u32 j = 0, k = 0; j < salt_word_len; j += 8, k++)
{
- u32 data[2];
+ u32x data[2];
data[0] = ((dst[k] << 16) & 0xff000000) | ((dst[k] << 8) & 0x0000ff00);
data[1] = ((dst[k] >> 0) & 0xff000000) | ((dst[k] >> 8) & 0x0000ff00);
* cmp
*/
- const u32 r0 = iv[0];
- const u32 r1 = iv[1];
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x c = 0;
+ u32x d = 0;
- #include COMPARE_M
+ COMPARE_M_SIMD (iv[0], iv[1], c, d);
}
}
-static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
w3_t[2] = w[14];
w3_t[3] = w[15];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 dst[16];
+ u32x dst[16];
dst[ 0] = w0_t[0];
dst[ 1] = w0_t[1];
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- overwrite_at (dst, w0, salt_len);
+ overwrite_at_le (dst, w0, salt_len);
/**
* precompute key1 since key is static: 0x0123456789abcdef
* plus LEFT_ROTATE by 2
*/
- u32 Kc[16];
+ u32x Kc[16];
Kc[ 0] = 0x64649040;
Kc[ 1] = 0x14909858;
Kc[14] = 0x584020b4;
Kc[15] = 0x00742c4c;
- u32 Kd[16];
+ u32x Kd[16];
Kd[ 0] = 0xa42ce40c;
Kd[ 1] = 0x64689858;
* key1 (generate key)
*/
- u32 iv[2];
+ u32x iv[2];
iv[0] = 0;
iv[1] = 0;
for (u32 j = 0, k = 0; j < salt_word_len; j += 8, k++)
{
- u32 data[2];
+ u32x data[2];
data[0] = ((dst[k] << 16) & 0xff000000) | ((dst[k] << 8) & 0x0000ff00);
data[1] = ((dst[k] >> 0) & 0xff000000) | ((dst[k] >> 8) & 0x0000ff00);
for (u32 j = 0, k = 0; j < salt_word_len; j += 8, k++)
{
- u32 data[2];
+ u32x data[2];
data[0] = ((dst[k] << 16) & 0xff000000) | ((dst[k] << 8) & 0x0000ff00);
data[1] = ((dst[k] >> 0) & 0xff000000) | ((dst[k] >> 8) & 0x0000ff00);
* cmp
*/
- const u32 r0 = iv[0];
- const u32 r1 = iv[1];
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x c = 0;
+ u32x d = 0;
- #include COMPARE_S
+ COMPARE_S_SIMD (iv[0], iv[1], c, d);
}
}
-__kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* modifier
m03100m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m03100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m03100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* modifier
m03100m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m03100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m03100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* modifier
m03100s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m03100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m03100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* modifier
m03100s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m03100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m03100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w3_t[2] = pw_salt_len * 8;
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w3_t[2] = pw_salt_len * 8;
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w3_t[2] = pw_salt_len * 8;
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w3_t[2] = pw_salt_len * 8;
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m03710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
-
w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
| uint_to_hex_lower8 ((a >> 8) & 255) << 16;
w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w3_t[2] = pw_salt_len * 8;
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
-
w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
| uint_to_hex_lower8 ((a >> 8) & 255) << 16;
w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w3_t[2] = pw_salt_len * 8;
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + out_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + out_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
* append salt
*/
- u32 s0[4];
+ u32x s0[4];
s0[0] = salt_buf0[0];
s0[1] = salt_buf0[1];
s0[2] = salt_buf0[2];
s0[3] = salt_buf0[3];
- u32 s1[4];
+ u32x s1[4];
s1[0] = salt_buf1[0];
s1[1] = salt_buf1[1];
s1[2] = salt_buf1[2];
s1[3] = salt_buf1[3];
- u32 s2[4];
+ u32x s2[4];
s2[0] = 0;
s2[1] = 0;
s2[2] = 0;
s2[3] = 0;
- u32 s3[4];
+ u32x s3[4];
s3[0] = 0;
s3[1] = 0;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
* md5
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
* append salt
*/
- u32 s0[4];
+ u32x s0[4];
s0[0] = salt_buf0[0];
s0[1] = salt_buf0[1];
s0[2] = salt_buf0[2];
s0[3] = salt_buf0[3];
- u32 s1[4];
+ u32x s1[4];
s1[0] = salt_buf1[0];
s1[1] = salt_buf1[1];
s1[2] = salt_buf1[2];
s1[3] = salt_buf1[3];
- u32 s2[4];
+ u32x s2[4];
s2[0] = 0;
s2[1] = 0;
s2[2] = 0;
s2[3] = 0;
- u32 s3[4];
+ u32x s3[4];
s3[0] = 0;
s3[1] = 0;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
* md5
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m04310m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0
- | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
- const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0
- | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
- const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0
- | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
- const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0
- | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
- const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0
- | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
- const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0
- | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
- const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0
- | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
- const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0
- | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
-
- const u32 w8_t = s[0];
- const u32 w9_t = s[1];
- const u32 wa_t = s[2];
- const u32 wb_t = s[3];
- const u32 wc_t = s[4];
- const u32 wd_t = s[5];
- const u32 we_t = s[6];
- const u32 wf_t = s[7];
+ w0_t[0] = uint_to_hex_upper8 ((a >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((a >> 8) & 255) << 16;
+ w0_t[1] = uint_to_hex_upper8 ((a >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((a >> 24) & 255) << 16;
+ w0_t[2] = uint_to_hex_upper8 ((b >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((b >> 8) & 255) << 16;
+ w0_t[3] = uint_to_hex_upper8 ((b >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((b >> 24) & 255) << 16;
+ w1_t[0] = uint_to_hex_upper8 ((c >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((c >> 8) & 255) << 16;
+ w1_t[1] = uint_to_hex_upper8 ((c >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((c >> 24) & 255) << 16;
+ w1_t[2] = uint_to_hex_upper8 ((d >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((d >> 8) & 255) << 16;
+ w1_t[3] = uint_to_hex_upper8 ((d >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((d >> 24) & 255) << 16;
+
+ w2_t[0] = s[0];
+ w2_t[1] = s[1];
+ w2_t[2] = s[2];
+ w2_t[3] = s[3];
+
+ w3_t[0] = s[4];
+ w3_t[1] = s[5];
+ w3_t[2] = s[6];
+ w3_t[3] = s[7];
a = MD5M_A;
b = MD5M_B;
c = MD5M_C;
d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
-
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
+
+ COMPARE_M_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
-
- w0[0] = w0l | w0r;
-
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
c += MD5M_C;
d += MD5M_D;
- const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0
- | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
- const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0
- | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
- const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0
- | uint_to_hex_lower8 ((b >> 8) & 255) << 16;
- const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0
- | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
- const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0
- | uint_to_hex_lower8 ((c >> 8) & 255) << 16;
- const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0
- | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
- const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0
- | uint_to_hex_lower8 ((d >> 8) & 255) << 16;
- const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0
- | uint_to_hex_lower8 ((d >> 24) & 255) << 16;
-
- const u32 w8_t = s[0];
- const u32 w9_t = s[1];
- const u32 wa_t = s[2];
- const u32 wb_t = s[3];
- const u32 wc_t = s[4];
- const u32 wd_t = s[5];
- const u32 we_t = s[6];
- const u32 wf_t = s[7];
+ w0_t[0] = uint_to_hex_upper8 ((a >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((a >> 8) & 255) << 16;
+ w0_t[1] = uint_to_hex_upper8 ((a >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((a >> 24) & 255) << 16;
+ w0_t[2] = uint_to_hex_upper8 ((b >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((b >> 8) & 255) << 16;
+ w0_t[3] = uint_to_hex_upper8 ((b >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((b >> 24) & 255) << 16;
+ w1_t[0] = uint_to_hex_upper8 ((c >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((c >> 8) & 255) << 16;
+ w1_t[1] = uint_to_hex_upper8 ((c >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((c >> 24) & 255) << 16;
+ w1_t[2] = uint_to_hex_upper8 ((d >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((d >> 8) & 255) << 16;
+ w1_t[3] = uint_to_hex_upper8 ((d >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((d >> 24) & 255) << 16;
+
+ w2_t[0] = s[0];
+ w2_t[1] = s[1];
+ w2_t[2] = s[2];
+ w2_t[3] = s[3];
+
+ w3_t[0] = s[4];
+ w3_t[1] = s[5];
+ w3_t[2] = s[6];
+ w3_t[3] = s[7];
a = MD5M_A;
b = MD5M_B;
c = MD5M_C;
d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
-
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
+
+ COMPARE_S_SIMD (a, d, c, b);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m04400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8_le(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m04500m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u);
+ const u32 e_rev = rotl32_S (search[1], 2u);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
- if (allx (e != e_rev)) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#undef _MD5_
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8_le(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m04700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
- w0[0] = w0l | w0r;
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
/**
* md5
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
* sha1
*/
- u32 w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0
- | uint_to_hex_lower8_le ((a >> 0) & 255) << 16;
- u32 w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0
- | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
- u32 w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0
- | uint_to_hex_lower8_le ((b >> 0) & 255) << 16;
- u32 w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0
- | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
- u32 w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0
- | uint_to_hex_lower8_le ((c >> 0) & 255) << 16;
- u32 w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0
- | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
- u32 w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0
- | uint_to_hex_lower8_le ((d >> 0) & 255) << 16;
- u32 w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0
- | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;
-
- u32 w8_t = 0x80000000;
- u32 w9_t = 0;
- u32 wa_t = 0;
- u32 wb_t = 0;
- u32 wc_t = 0;
- u32 wd_t = 0;
- u32 we_t = 0;
- u32 wf_t = 32 * 8;
-
- u32 e;
+ w0_t[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0
+ | uint_to_hex_lower8_le ((a >> 0) & 255) << 16;
+ w0_t[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0
+ | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
+ w0_t[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0
+ | uint_to_hex_lower8_le ((b >> 0) & 255) << 16;
+ w0_t[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0
+ | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
+ w1_t[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0
+ | uint_to_hex_lower8_le ((c >> 0) & 255) << 16;
+ w1_t[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0
+ | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
+ w1_t[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0
+ | uint_to_hex_lower8_le ((d >> 0) & 255) << 16;
+ w1_t[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0
+ | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;
+
+ w2_t[0] = 0x80000000;
+ w2_t[1] = 0;
+ w2_t[2] = 0;
+ w2_t[3] = 0;
+
+ w3_t[0] = 0;
+ w3_t[1] = 0;
+ w3_t[2] = 0;
+ w3_t[3] = 32 * 8;
+
+ u32x e;
a = SHA1M_A;
b = SHA1M_B;
#undef K
#define K SHA1C00
- SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t);
- SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t);
- SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t);
- SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t);
- SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t);
- SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t);
- SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t);
- SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t);
- SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t);
- SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t);
- SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t);
- SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t);
- SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t);
- SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t);
- SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t);
- SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t);
- w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t);
- w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t);
- w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t);
- w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t);
+ SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t[0]);
+ SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t[1]);
+ SHA1_STEP (SHA1_F0o, d, e, a, b, c, w0_t[2]);
+ SHA1_STEP (SHA1_F0o, c, d, e, a, b, w0_t[3]);
+ SHA1_STEP (SHA1_F0o, b, c, d, e, a, w1_t[0]);
+ SHA1_STEP (SHA1_F0o, a, b, c, d, e, w1_t[1]);
+ SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t[2]);
+ SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t[3]);
+ SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t[0]);
+ SHA1_STEP (SHA1_F0o, b, c, d, e, a, w2_t[1]);
+ SHA1_STEP (SHA1_F0o, a, b, c, d, e, w2_t[2]);
+ SHA1_STEP (SHA1_F0o, e, a, b, c, d, w2_t[3]);
+ SHA1_STEP (SHA1_F0o, d, e, a, b, c, w3_t[0]);
+ SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t[1]);
+ SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t[2]);
+ SHA1_STEP (SHA1_F0o, a, b, c, d, e, w3_t[3]);
+ w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t[0]);
+ w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w0_t[1]);
+ w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w0_t[2]);
+ w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w0_t[3]);
#undef K
#define K SHA1C01
- w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t);
- w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t);
- w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t);
- w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t);
- w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t);
- w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t);
- wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t);
- wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t);
- wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t);
- wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t);
- we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t);
- wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t);
- w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t);
- w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t);
- w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t);
- w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t);
- w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t);
- w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t);
- w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t);
- w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t);
+ w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t[0]);
+ w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[1]);
+ w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w1_t[2]);
+ w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[3]);
+ w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t[0]);
+ w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[1]);
+ w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t[2]);
+ w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w2_t[3]);
+ w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[0]);
+ w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[1]);
+ w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t[2]);
+ w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[3]);
+ w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t[0]);
+ w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w0_t[1]);
+ w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t[2]);
+ w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w0_t[3]);
+ w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[0]);
+ w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w1_t[1]);
+ w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[2]);
+ w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w1_t[3]);
#undef K
#define K SHA1C02
- w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t);
- w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t);
- wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t);
- wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t);
- wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t);
- wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t);
- we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t);
- wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t);
- w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t);
- w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t);
- w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t);
- w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t);
- w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t);
- w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t);
- w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t);
- w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t);
- w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t);
- w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t);
- wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t);
- wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t);
+ w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t[0]);
+ w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w2_t[1]);
+ w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w2_t[2]);
+ w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w2_t[3]);
+ w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w3_t[0]);
+ w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w3_t[1]);
+ w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t[2]);
+ w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w3_t[3]);
+ w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t[0]);
+ w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w0_t[1]);
+ w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w0_t[2]);
+ w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w0_t[3]);
+ w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w1_t[0]);
+ w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w1_t[1]);
+ w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t[2]);
+ w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w1_t[3]);
+ w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w2_t[0]);
+ w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w2_t[1]);
+ w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w2_t[2]);
+ w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w2_t[3]);
#undef K
#define K SHA1C03
- wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t);
- wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t);
- we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t);
- wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t);
- w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t);
- w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t);
- w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t);
- w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t);
- w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t);
- w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t);
- w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t);
- w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t);
- w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t);
- w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t);
- wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
- wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
- wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
- wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
- we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
- wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t[0]);
+ w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[1]);
+ w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[2]);
+ w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[3]);
+ w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t[0]);
+ w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w0_t[1]);
+ w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w0_t[2]);
+ w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t[3]);
+ w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[0]);
+ w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w1_t[1]);
+ w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t[2]);
+ w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[3]);
+ w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w2_t[0]);
+ w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w2_t[1]);
+ w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t[2]);
+ w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
+ w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
+ w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
+ w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
+ w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
+
+ COMPARE_M_SIMD (d, e, c, b);
}
}
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u);
+ const u32 e_rev = rotl32_S (search[1], 2u);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ u32x w1_t[4];
- w0[0] = w0l | w0r;
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ u32x w2_t[4];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ u32x w3_t[4];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
/**
* md5
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
-
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
- MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
- MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
- MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
- MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
-
- MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
- MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
- MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
- MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
-
- MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
- MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
- MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
- MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
-
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
- MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
- MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
- MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
- MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
+
+ MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03);
+ MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00);
+ MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01);
+ MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02);
+ MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03);
+
+ MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13);
+ MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10);
+ MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11);
+ MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12);
+ MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13);
+
+ MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23);
+ MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20);
+ MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21);
+ MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22);
+ MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23);
+
+ MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
+ MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
+ MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
+ MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
+ MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
a += MD5M_A;
b += MD5M_B;
* sha1
*/
- u32 w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0
- | uint_to_hex_lower8_le ((a >> 0) & 255) << 16;
- u32 w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0
- | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
- u32 w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0
- | uint_to_hex_lower8_le ((b >> 0) & 255) << 16;
- u32 w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0
- | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
- u32 w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0
- | uint_to_hex_lower8_le ((c >> 0) & 255) << 16;
- u32 w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0
- | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
- u32 w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0
- | uint_to_hex_lower8_le ((d >> 0) & 255) << 16;
- u32 w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0
- | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;
-
- u32 w8_t = 0x80000000;
- u32 w9_t = 0;
- u32 wa_t = 0;
- u32 wb_t = 0;
- u32 wc_t = 0;
- u32 wd_t = 0;
- u32 we_t = 0;
- u32 wf_t = 32 * 8;
-
- u32 e;
+ w0_t[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0
+ | uint_to_hex_lower8_le ((a >> 0) & 255) << 16;
+ w0_t[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0
+ | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
+ w0_t[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0
+ | uint_to_hex_lower8_le ((b >> 0) & 255) << 16;
+ w0_t[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0
+ | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
+ w1_t[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0
+ | uint_to_hex_lower8_le ((c >> 0) & 255) << 16;
+ w1_t[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0
+ | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
+ w1_t[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0
+ | uint_to_hex_lower8_le ((d >> 0) & 255) << 16;
+ w1_t[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0
+ | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;
+
+ w2_t[0] = 0x80000000;
+ w2_t[1] = 0;
+ w2_t[2] = 0;
+ w2_t[3] = 0;
+
+ w3_t[0] = 0;
+ w3_t[1] = 0;
+ w3_t[2] = 0;
+ w3_t[3] = 32 * 8;
+
+ u32x e;
a = SHA1M_A;
b = SHA1M_B;
#undef K
#define K SHA1C00
- SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t);
- SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t);
- SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t);
- SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t);
- SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t);
- SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t);
- SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t);
- SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t);
- SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t);
- SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t);
- SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t);
- SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t);
- SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t);
- SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t);
- SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t);
- SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t);
- w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t);
- w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t);
- w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t);
- w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t);
+ SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t[0]);
+ SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t[1]);
+ SHA1_STEP (SHA1_F0o, d, e, a, b, c, w0_t[2]);
+ SHA1_STEP (SHA1_F0o, c, d, e, a, b, w0_t[3]);
+ SHA1_STEP (SHA1_F0o, b, c, d, e, a, w1_t[0]);
+ SHA1_STEP (SHA1_F0o, a, b, c, d, e, w1_t[1]);
+ SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t[2]);
+ SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t[3]);
+ SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t[0]);
+ SHA1_STEP (SHA1_F0o, b, c, d, e, a, w2_t[1]);
+ SHA1_STEP (SHA1_F0o, a, b, c, d, e, w2_t[2]);
+ SHA1_STEP (SHA1_F0o, e, a, b, c, d, w2_t[3]);
+ SHA1_STEP (SHA1_F0o, d, e, a, b, c, w3_t[0]);
+ SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t[1]);
+ SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t[2]);
+ SHA1_STEP (SHA1_F0o, a, b, c, d, e, w3_t[3]);
+ w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t[0]);
+ w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w0_t[1]);
+ w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w0_t[2]);
+ w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w0_t[3]);
#undef K
#define K SHA1C01
- w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t);
- w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t);
- w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t);
- w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t);
- w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t);
- w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t);
- wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t);
- wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t);
- wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t);
- wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t);
- we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t);
- wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t);
- w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t);
- w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t);
- w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t);
- w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t);
- w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t);
- w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t);
- w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t);
- w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t);
+ w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t[0]);
+ w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[1]);
+ w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w1_t[2]);
+ w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[3]);
+ w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t[0]);
+ w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[1]);
+ w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t[2]);
+ w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w2_t[3]);
+ w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[0]);
+ w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[1]);
+ w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t[2]);
+ w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[3]);
+ w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t[0]);
+ w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w0_t[1]);
+ w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t[2]);
+ w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w0_t[3]);
+ w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[0]);
+ w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w1_t[1]);
+ w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[2]);
+ w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w1_t[3]);
#undef K
#define K SHA1C02
- w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t);
- w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t);
- wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t);
- wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t);
- wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t);
- wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t);
- we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t);
- wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t);
- w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t);
- w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t);
- w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t);
- w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t);
- w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t);
- w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t);
- w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t);
- w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t);
- w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t);
- w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t);
- wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t);
- wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t);
+ w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t[0]);
+ w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w2_t[1]);
+ w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w2_t[2]);
+ w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w2_t[3]);
+ w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w3_t[0]);
+ w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w3_t[1]);
+ w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t[2]);
+ w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w3_t[3]);
+ w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t[0]);
+ w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w0_t[1]);
+ w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w0_t[2]);
+ w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w0_t[3]);
+ w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w1_t[0]);
+ w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w1_t[1]);
+ w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t[2]);
+ w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w1_t[3]);
+ w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w2_t[0]);
+ w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w2_t[1]);
+ w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w2_t[2]);
+ w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w2_t[3]);
#undef K
#define K SHA1C03
- wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t);
- wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t);
- we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t);
- wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t);
- w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t);
- w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t);
- w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t);
- w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t);
- w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t);
- w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t);
- w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t);
- w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t);
- w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t);
- w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t);
- wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
- wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
-
- if (allx (e != e_rev)) continue;
-
- wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
- wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
- we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
- wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
-
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t[0]);
+ w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[1]);
+ w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[2]);
+ w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[3]);
+ w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t[0]);
+ w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w0_t[1]);
+ w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w0_t[2]);
+ w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t[3]);
+ w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[0]);
+ w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w1_t[1]);
+ w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t[2]);
+ w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[3]);
+ w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w2_t[0]);
+ w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w2_t[1]);
+ w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t[2]);
+ w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
+
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
+
+ w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
+ w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
+ w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
+ w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
+
+ COMPARE_S_SIMD (d, e, c, b);
}
}
+
__kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
w0[0] |= s0[0];
w0[1] |= s0[1];
* add id byte
*/
- switch_buffer_by_offset (w0, w1, w2, w3, 1);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, 1);
w0[0] |= salt_buf[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
w0[0] |= s0[0];
w0[1] |= s0[1];
* add id byte
*/
- switch_buffer_by_offset (w0, w1, w2, w3, 1);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, 1);
w0[0] |= salt_buf[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
* add id byte
*/
- switch_buffer_by_offset (w0, w1, w2, w3, 1);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, 1);
w0[0] |= salt_buf[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
* add id byte
*/
- switch_buffer_by_offset (w0, w1, w2, w3, 1);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, 1);
w0[0] |= salt_buf[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len);
const u32 pw_salt_len = pw_len + salt_len;
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m04800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
// move w by 1
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = w0[0];
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1);
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len);
+ switch_buffer_by_offset_le_S (s0, s1, s2, s3, 1 + pw_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
* loop
*/
- u32 w0l = w0_t[0];
- u32 w1l = w0_t[1];
+ u32x w0l = w0_t[0];
+ u32x w1l = w0_t[1];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
w0_t[0] = w0l | (w0r << 8);
w0_t[1] = w1l | (w0r >> 24);
* md5
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
// move w by 1
- u32 w0_t[4];
+ u32x w0_t[4];
w0_t[0] = w0[0];
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = 0;
w2_t[1] = 0;
w2_t[2] = 0;
w2_t[3] = 0;
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1);
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len);
+ switch_buffer_by_offset_le_S (s0, s1, s2, s3, 1 + pw_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
* loop
*/
- u32 w0l = w0_t[0];
- u32 w1l = w0_t[1];
+ u32x w0l = w0_t[0];
+ u32x w1l = w0_t[1];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
w0_t[0] = w0l | (w0r << 8);
w0_t[1] = w1l | (w0r >> 24);
* md5
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
- bool q_cond = allx (search[0] != a);
-
- if (q_cond) continue;
+ if (MATCHES_NONE_VS (a, search[0])) continue;
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + out_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + out_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0_t[4];
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0_t[4];
* prepend salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len);
w0_t[0] |= s0[0];
w0_t[1] |= s0[1];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
// first we need to switch the right-hand salt to the correct position (2nd salt)
- switch_buffer_by_offset (salt_buf0_t, salt_buf1_t, salt_buf2_t, salt_buf3_t, salt_len + pw_len);
+ switch_buffer_by_offset_le_S (salt_buf0_t, salt_buf1_t, salt_buf2_t, salt_buf3_t, salt_len + pw_len);
u32 salt_buf0[4];
salt_buf3[2] |= salt_buf3_t[2];
salt_buf3[3] |= salt_buf3_t[3];
- append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len);
+ append_0x80_4x4_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
* put the password after the first salt but before the second salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[1] |= salt_buf3[1];
w3_t[2] |= salt_buf3[2];
- u32 w0 = swap32 (w0_t[0]);
- u32 w1 = swap32 (w0_t[1]);
- u32 w2 = swap32 (w0_t[2]);
- u32 w3 = swap32 (w0_t[3]);
- u32 w4 = swap32 (w1_t[0]);
- u32 w5 = swap32 (w1_t[1]);
- u32 w6 = swap32 (w1_t[2]);
- u32 w7 = swap32 (w1_t[3]);
- u32 w8 = swap32 (w2_t[0]);
- u32 w9 = swap32 (w2_t[1]);
- u32 wa = swap32 (w2_t[2]);
- u32 wb = swap32 (w2_t[3]);
- u32 wc = swap32 (w3_t[0]);
- u32 wd = swap32 (w3_t[1]);
- u32 we = swap32 (w3_t[2]);
- u32 wf = pw_salt_len * 8;
+ u32x w0 = swap32 (w0_t[0]);
+ u32x w1 = swap32 (w0_t[1]);
+ u32x w2 = swap32 (w0_t[2]);
+ u32x w3 = swap32 (w0_t[3]);
+ u32x w4 = swap32 (w1_t[0]);
+ u32x w5 = swap32 (w1_t[1]);
+ u32x w6 = swap32 (w1_t[2]);
+ u32x w7 = swap32 (w1_t[3]);
+ u32x w8 = swap32 (w2_t[0]);
+ u32x w9 = swap32 (w2_t[1]);
+ u32x wa = swap32 (w2_t[2]);
+ u32x wb = swap32 (w2_t[3]);
+ u32x wc = swap32 (w3_t[0]);
+ u32x wd = swap32 (w3_t[1]);
+ u32x we = swap32 (w3_t[2]);
+ u32x wf = pw_salt_len * 8;
/**
* sha1
*/
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we);
wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf);
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u);
+ const u32 e_rev = rotl32_S (search[1], 2u);
/**
* salt
// first we need to switch the right-hand salt to the correct position (2nd salt)
- switch_buffer_by_offset (salt_buf0_t, salt_buf1_t, salt_buf2_t, salt_buf3_t, salt_len + pw_len);
+ switch_buffer_by_offset_le_S (salt_buf0_t, salt_buf1_t, salt_buf2_t, salt_buf3_t, salt_len + pw_len);
u32 salt_buf0[4];
salt_buf3[2] |= salt_buf3_t[2];
salt_buf3[3] |= salt_buf3_t[3];
- append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len);
+ append_0x80_4x4_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
* put the password after the first salt but before the second salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] |= salt_buf0[0];
w0_t[1] |= salt_buf0[1];
w3_t[1] |= salt_buf3[1];
w3_t[2] |= salt_buf3[2];
- u32 w0 = swap32 (w0_t[0]);
- u32 w1 = swap32 (w0_t[1]);
- u32 w2 = swap32 (w0_t[2]);
- u32 w3 = swap32 (w0_t[3]);
- u32 w4 = swap32 (w1_t[0]);
- u32 w5 = swap32 (w1_t[1]);
- u32 w6 = swap32 (w1_t[2]);
- u32 w7 = swap32 (w1_t[3]);
- u32 w8 = swap32 (w2_t[0]);
- u32 w9 = swap32 (w2_t[1]);
- u32 wa = swap32 (w2_t[2]);
- u32 wb = swap32 (w2_t[3]);
- u32 wc = swap32 (w3_t[0]);
- u32 wd = swap32 (w3_t[1]);
- u32 we = swap32 (w3_t[2]);
- u32 wf = pw_salt_len * 8;
+ u32x w0 = swap32 (w0_t[0]);
+ u32x w1 = swap32 (w0_t[1]);
+ u32x w2 = swap32 (w0_t[2]);
+ u32x w3 = swap32 (w0_t[3]);
+ u32x w4 = swap32 (w1_t[0]);
+ u32x w5 = swap32 (w1_t[1]);
+ u32x w6 = swap32 (w1_t[2]);
+ u32x w7 = swap32 (w1_t[3]);
+ u32x w8 = swap32 (w2_t[0]);
+ u32x w9 = swap32 (w2_t[1]);
+ u32x wa = swap32 (w2_t[2]);
+ u32x wb = swap32 (w2_t[3]);
+ u32x wc = swap32 (w3_t[0]);
+ u32x wd = swap32 (w3_t[1]);
+ u32x we = swap32 (w3_t[2]);
+ u32x wf = pw_salt_len * 8;
/**
* sha1
*/
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
wa = rotl32 ((w7 ^ w2 ^ wc ^ wa), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa);
wb = rotl32 ((w8 ^ w3 ^ wd ^ wb), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb);
- if (allx (e != e_rev)) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
wc = rotl32 ((w9 ^ w4 ^ we ^ wc), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc);
wd = rotl32 ((wa ^ w5 ^ wf ^ wd), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd);
we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we);
wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf);
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
{
append_0x01_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x01_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x01_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x01_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _KECCAK_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 keccakf_rndc[24] =
{
#define Rho_Pi(s) \
{ \
- u32 j = keccakf_piln[s]; \
- u32 k = keccakf_rotc[s]; \
+ u32 j = keccakf_piln[s]; \
+ u32 k = keccakf_rotc[s]; \
bc0 = st[j]; \
st[j] = rotl64 (t, k); \
t = bc0; \
* const
*/
- const u8 keccakf_rotc[24] =
+ const u32 keccakf_rotc[24] =
{
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
};
- const u8 keccakf_piln[24] =
+ const u32 keccakf_piln[24] =
{
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u64 st[25];
+ u64x st[25];
- st[ 0] = (u64) (w0[0]) | (u64) (w0[1]) << 32;
- st[ 1] = (u64) (w0[2]) | (u64) (w0[3]) << 32;
- st[ 2] = (u64) (w1[0]) | (u64) (w1[1]) << 32;
- st[ 3] = (u64) (w1[2]) | (u64) (w1[3]) << 32;
- st[ 4] = (u64) (w2[0]) | (u64) (w2[1]) << 32;
- st[ 5] = (u64) (w2[2]) | (u64) (w2[3]) << 32;
- st[ 6] = (u64) (w3[0]) | (u64) (w3[1]) << 32;
- st[ 7] = (u64) (w3[2]) | (u64) (w3[3]) << 32;
+ st[ 0] = hl32_to_64 (w0[1], w0lr);
+ st[ 1] = hl32_to_64 (w0[3], w0[2]);
+ st[ 2] = hl32_to_64 (w1[1], w1[0]);
+ st[ 3] = hl32_to_64 (w1[3], w1[2]);
+ st[ 4] = hl32_to_64 (w2[1], w2[0]);
+ st[ 5] = hl32_to_64 (w2[3], w2[2]);
+ st[ 6] = hl32_to_64 (w3[1], w3[0]);
+ st[ 7] = hl32_to_64 (w3[3], w3[2]);
st[ 8] = 0;
st[ 9] = 0;
st[10] = 0;
{
// Theta
- u64 bc0 = Theta1 (0);
- u64 bc1 = Theta1 (1);
- u64 bc2 = Theta1 (2);
- u64 bc3 = Theta1 (3);
- u64 bc4 = Theta1 (4);
+ u64x bc0 = Theta1 (0);
+ u64x bc1 = Theta1 (1);
+ u64x bc2 = Theta1 (2);
+ u64x bc3 = Theta1 (3);
+ u64x bc4 = Theta1 (4);
- u64 t;
+ u64x t;
t = bc4 ^ rotl64 (bc1, 1); Theta2 (0);
t = bc0 ^ rotl64 (bc2, 1); Theta2 (1);
st[0] ^= keccakf_rndc[round];
}
- const u32 r0 = l32_from_64 (st[1]);
- const u32 r1 = h32_from_64 (st[1]);
- const u32 r2 = l32_from_64 (st[2]);
- const u32 r3 = h32_from_64 (st[2]);
+ const u32x r0 = l32_from_64 (st[1]);
+ const u32x r1 = h32_from_64 (st[1]);
+ const u32x r2 = l32_from_64 (st[2]);
+ const u32x r3 = h32_from_64 (st[2]);
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
* const
*/
- const u8 keccakf_rotc[24] =
+ const u32 keccakf_rotc[24] =
{
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
};
- const u8 keccakf_piln[24] =
+ const u32 keccakf_piln[24] =
{
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u64 st[25];
+ u64x st[25];
- st[ 0] = (u64) (w0[0]) | (u64) (w0[1]) << 32;
- st[ 1] = (u64) (w0[2]) | (u64) (w0[3]) << 32;
- st[ 2] = (u64) (w1[0]) | (u64) (w1[1]) << 32;
- st[ 3] = (u64) (w1[2]) | (u64) (w1[3]) << 32;
- st[ 4] = (u64) (w2[0]) | (u64) (w2[1]) << 32;
- st[ 5] = (u64) (w2[2]) | (u64) (w2[3]) << 32;
- st[ 6] = (u64) (w3[0]) | (u64) (w3[1]) << 32;
- st[ 7] = (u64) (w3[2]) | (u64) (w3[3]) << 32;
+ st[ 0] = hl32_to_64 (w0[1], w0lr);
+ st[ 1] = hl32_to_64 (w0[3], w0[2]);
+ st[ 2] = hl32_to_64 (w1[1], w1[0]);
+ st[ 3] = hl32_to_64 (w1[3], w1[2]);
+ st[ 4] = hl32_to_64 (w2[1], w2[0]);
+ st[ 5] = hl32_to_64 (w2[3], w2[2]);
+ st[ 6] = hl32_to_64 (w3[1], w3[0]);
+ st[ 7] = hl32_to_64 (w3[3], w3[2]);
st[ 8] = 0;
st[ 9] = 0;
st[10] = 0;
{
// Theta
- u64 bc0 = Theta1 (0);
- u64 bc1 = Theta1 (1);
- u64 bc2 = Theta1 (2);
- u64 bc3 = Theta1 (3);
- u64 bc4 = Theta1 (4);
+ u64x bc0 = Theta1 (0);
+ u64x bc1 = Theta1 (1);
+ u64x bc2 = Theta1 (2);
+ u64x bc3 = Theta1 (3);
+ u64x bc4 = Theta1 (4);
- u64 t;
+ u64x t;
t = bc4 ^ rotl64 (bc1, 1); Theta2 (0);
t = bc0 ^ rotl64 (bc2, 1); Theta2 (1);
st[0] ^= keccakf_rndc[round];
}
- const u32 r0 = l32_from_64 (st[1]);
- const u32 r1 = h32_from_64 (st[1]);
- const u32 r2 = l32_from_64 (st[2]);
- const u32 r3 = h32_from_64 (st[2]);
+ const u32x r0 = l32_from_64 (st[1]);
+ const u32x r1 = h32_from_64 (st[1]);
+ const u32x r2 = l32_from_64 (st[2]);
+ const u32x r3 = h32_from_64 (st[2]);
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5H_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m05100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, a, b, c, d, w0lr, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, b, c, d, a, w0lr, MD5C13, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , d, a, b, c, w0lr, MD5C29, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , a, b, c, d, w0lr, MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
c += MD5M_C;
d += MD5M_D;
- {
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_M
- }
+ u32x e = 0;
+ u32x f = 0;
- {
- const u32 r0 = b;
- const u32 r1 = c;
- const u32 r2 = 0;
- const u32 r3 = 0;
+ COMPARE_M_SIMD (a, b, e, f);
- #include COMPARE_M
- }
+ COMPARE_M_SIMD (b, c, e, f);
- {
- const u32 r0 = c;
- const u32 r1 = d;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_M
- }
+ COMPARE_M_SIMD (c, d, e, f);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
- MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
+ MD5_STEP (MD5_Fo, a, b, c, d, w0lr, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
- MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
+ MD5_STEP (MD5_Go, b, c, d, a, w0lr, MD5C13, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
- MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
+ MD5_STEP (MD5_H , d, a, b, c, w0lr, MD5C29, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
- MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
+ MD5_STEP (MD5_I , a, b, c, d, w0lr, MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
c += MD5M_C;
d += MD5M_D;
- {
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_S
- }
-
- {
- const u32 r0 = b;
- const u32 r1 = c;
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x e = 0;
+ u32x f = 0;
- #include COMPARE_S
- }
+ COMPARE_S_SIMD (a, b, e, f);
- {
- const u32 r0 = c;
- const u32 r1 = d;
- const u32 r2 = 0;
- const u32 r3 = 0;
+ COMPARE_S_SIMD (b, c, e, f);
- #include COMPARE_S
- }
+ COMPARE_S_SIMD (c, d, e, f);
}
}
u32 salt_len = salt_bufs[salt_pos].salt_len;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+ switch_buffer_by_offset_le (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
digest[3] += d;
}
-static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4])
+static void hmac_md5_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
md5_transform (w0, w1, w2, w3, opad);
}
-static void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4])
+static void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- u32 ipad[4];
- u32 opad[4];
+ u32x ipad[4];
+ u32x opad[4];
hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = (64 + nr_len) * 8;
w3_t[3] = 0;
- u32 digest[4];
+ u32x digest[4];
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[0];
- const u32 r1 = digest[3];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- u32 ipad[4];
- u32 opad[4];
+ u32x ipad[4];
+ u32x opad[4];
hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = (64 + nr_len) * 8;
w3_t[3] = 0;
- u32 digest[4];
+ u32x digest[4];
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[0];
- const u32 r1 = digest[3];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
digest[4] += E;
}
-static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5])
+static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
sha1_transform (w0, w1, w2, w3, opad);
}
-static void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5])
+static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[5];
- u32 opad[5];
+ u32x ipad[5];
+ u32x opad[5];
hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (64 + nr_len) * 8;
- u32 digest[5];
+ u32x digest[5];
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[5];
- u32 opad[5];
+ u32x ipad[5];
+ u32x opad[5];
hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (64 + nr_len) * 8;
- u32 digest[5];
+ u32x digest[5];
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
for (u32 i = lid; i < 16; i += lsz)
{
- w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]);
+ w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 128; i += lsz)
{
- s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]);
+ s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 16; i += lsz)
{
- w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]);
+ w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 128; i += lsz)
{
- s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]);
+ s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 16; i += lsz)
{
- w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]);
+ w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 128; i += lsz)
{
- s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]);
+ s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 16; i += lsz)
{
- w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]);
+ w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 128; i += lsz)
{
- s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]);
+ s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 16; i += lsz)
{
- w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]);
+ w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 128; i += lsz)
{
- s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]);
+ s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 16; i += lsz)
{
- w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]);
+ w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
for (u32 i = lid; i < 128; i += lsz)
{
- s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]);
+ s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]);
}
barrier (CLK_LOCAL_MEM_FENCE);
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD4_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
#define PERM_OP(a,b,tt,n,m) \
{ \
}
};
+#if VECT_SIZE == 1
#define BOX(i,n,S) (S)[(n)][(i)]
-
-static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 s_SPtrans[8][64])
+#elif VECT_SIZE == 2
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#elif VECT_SIZE == 4
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#elif VECT_SIZE == 8
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#endif
+
+static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 s_SPtrans[8][64])
{
- u32 r = data[0];
- u32 l = data[1];
+ u32x r = data[0];
+ u32x l = data[1];
#pragma unroll 16
for (u32 i = 0; i < 16; i += 2)
{
- u32 u;
- u32 t;
+ u32x u;
+ u32x t;
u = Kc[i + 0] ^ rotl32 (r, 30u);
t = Kd[i + 0] ^ rotl32 (r, 26u);
iv[1] = r;
}
-static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 s_skb[8][64])
+static void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 s_skb[8][64])
{
- u32 tt;
+ u32x tt;
PERM_OP (d, c, tt, 4, 0x0f0f0f0f);
HPERM_OP (c, tt, 2, 0xcccc0000);
c = c & 0x0fffffff;
d = d & 0x0fffffff;
- const u32 c00 = (c >> 0) & 0x0000003f;
- const u32 c06 = (c >> 6) & 0x00383003;
- const u32 c07 = (c >> 7) & 0x0000003c;
- const u32 c13 = (c >> 13) & 0x0000060f;
- const u32 c20 = (c >> 20) & 0x00000001;
-
- u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb)
- | BOX (((c06 >> 0) & 0xff)
- |((c07 >> 0) & 0xff), 1, s_skb)
- | BOX (((c13 >> 0) & 0xff)
- |((c06 >> 8) & 0xff), 2, s_skb)
- | BOX (((c20 >> 0) & 0xff)
- |((c13 >> 8) & 0xff)
- |((c06 >> 16) & 0xff), 3, s_skb);
-
- const u32 d00 = (d >> 0) & 0x00003c3f;
- const u32 d07 = (d >> 7) & 0x00003f03;
- const u32 d21 = (d >> 21) & 0x0000000f;
- const u32 d22 = (d >> 22) & 0x00000030;
-
- u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb)
- | BOX (((d07 >> 0) & 0xff)
- |((d00 >> 8) & 0xff), 5, s_skb)
- | BOX (((d07 >> 8) & 0xff), 6, s_skb)
- | BOX (((d21 >> 0) & 0xff)
- |((d22 >> 0) & 0xff), 7, s_skb);
+ const u32x c00 = (c >> 0) & 0x0000003f;
+ const u32x c06 = (c >> 6) & 0x00383003;
+ const u32x c07 = (c >> 7) & 0x0000003c;
+ const u32x c13 = (c >> 13) & 0x0000060f;
+ const u32x c20 = (c >> 20) & 0x00000001;
+
+ u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb)
+ | BOX (((c06 >> 0) & 0xff)
+ |((c07 >> 0) & 0xff), 1, s_skb)
+ | BOX (((c13 >> 0) & 0xff)
+ |((c06 >> 8) & 0xff), 2, s_skb)
+ | BOX (((c20 >> 0) & 0xff)
+ |((c13 >> 8) & 0xff)
+ |((c06 >> 16) & 0xff), 3, s_skb);
+
+ const u32x d00 = (d >> 0) & 0x00003c3f;
+ const u32x d07 = (d >> 7) & 0x00003f03;
+ const u32x d21 = (d >> 21) & 0x0000000f;
+ const u32x d22 = (d >> 22) & 0x00000030;
+
+ u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb)
+ | BOX (((d07 >> 0) & 0xff)
+ |((d00 >> 8) & 0xff), 5, s_skb)
+ | BOX (((d07 >> 8) & 0xff), 6, s_skb)
+ | BOX (((d21 >> 0) & 0xff)
+ |((d22 >> 0) & 0xff), 7, s_skb);
Kc[i] = ((t << 16) | (s & 0x0000ffff));
Kd[i] = ((s >> 16) | (t & 0xffff0000));
}
}
-static void transform_netntlmv1_key (const u32 w0, const u32 w1, u32 out[2])
+static void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2])
{
- const uchar4 t0 = as_uchar4 (w0);
- const uchar4 t1 = as_uchar4 (w1);
-
- uchar4 k0;
- uchar4 k1;
-
- k0.s0 = (t0.s0 >> 0);
- k0.s1 = (t0.s0 << 7) | (t0.s1 >> 1);
- k0.s2 = (t0.s1 << 6) | (t0.s2 >> 2);
- k0.s3 = (t0.s2 << 5) | (t0.s3 >> 3);
- k1.s0 = (t0.s3 << 4) | (t1.s0 >> 4);
- k1.s1 = (t1.s0 << 3) | (t1.s1 >> 5);
- k1.s2 = (t1.s1 << 2) | (t1.s2 >> 6);
- k1.s3 = (t1.s2 << 1);
-
- out[0] = as_uint (k0);
- out[1] = as_uint (k1);
+ u32x t[8];
+
+ t[0] = (w0 >> 0) & 0xff;
+ t[1] = (w0 >> 8) & 0xff;
+ t[2] = (w0 >> 16) & 0xff;
+ t[3] = (w0 >> 24) & 0xff;
+ t[4] = (w1 >> 0) & 0xff;
+ t[5] = (w1 >> 8) & 0xff;
+ t[6] = (w1 >> 16) & 0xff;
+ t[7] = (w1 >> 24) & 0xff;
+
+ u32x k[8];
+
+ k[0] = (t[0] >> 0);
+ k[1] = (t[0] << 7) | (t[1] >> 1);
+ k[2] = (t[1] << 6) | (t[2] >> 2);
+ k[3] = (t[2] << 5) | (t[3] >> 3);
+ k[4] = (t[3] << 4) | (t[4] >> 4);
+ k[5] = (t[4] << 3) | (t[5] >> 5);
+ k[6] = (t[5] << 2) | (t[6] >> 6);
+ k[7] = (t[6] << 1);
+
+ out[0] = ((k[0] & 0xff) << 0)
+ | ((k[1] & 0xff) << 8)
+ | ((k[2] & 0xff) << 16)
+ | ((k[3] & 0xff) << 24);
+
+ out[1] = ((k[4] & 0xff) << 0)
+ | ((k[5] & 0xff) << 8)
+ | ((k[6] & 0xff) << 16)
+ | ((k[7] & 0xff) << 24);
}
-static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
const u32 s1 = salt_bufs[salt_pos].salt_buf[1];
const u32 s2 = salt_bufs[salt_pos].salt_buf[2];
- u32 data[2];
-
- data[0] = s0;
- data[1] = s1;
-
/**
* loop
*/
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MD4M_A;
- u32 b = MD4M_B;
- u32 c = MD4M_C;
- u32 d = MD4M_D;
+ u32x a = MD4M_A;
+ u32x b = MD4M_B;
+ u32x c = MD4M_C;
+ u32x d = MD4M_D;
#define w0_t w0
#define w1_t w[ 1]
MD4_STEP (MD4_H , a, b, c, d, w3_t, MD4C02, MD4S20);
MD4_STEP (MD4_H , d, a, b, c, wb_t, MD4C02, MD4S21);
- if (allx (s2 != ((d + MD4M_D) >> 16))) continue;
+ if (MATCHES_NONE_VS (((d + MD4M_D) >> 16), s2)) continue;
MD4_STEP (MD4_H , c, d, a, b, w7_t, MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, wf_t, MD4C02, MD4S23);
* DES1
*/
- u32 key[2];
+ u32x key[2];
transform_netntlmv1_key (a, b, key);
- u32 Kc[16];
- u32 Kd[16];
+ u32x Kc[16];
+ u32x Kd[16];
_des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);
- u32 iv1[2];
+ u32x data[2];
+
+ data[0] = s0;
+ data[1] = s1;
+
+ u32x iv1[2];
_des_crypt_encrypt (iv1, data, Kc, Kd, s_SPtrans);
* DES2
*/
- const u32 bc = (b >> 24) | (c << 8);
- const u32 cd = (c >> 24) | (d << 8);
+ const u32x bc = (b >> 24) | (c << 8);
+ const u32x cd = (c >> 24) | (d << 8);
transform_netntlmv1_key (bc, cd, key);
_des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);
- u32 iv2[2];
+ u32x iv2[2];
_des_crypt_encrypt (iv2, data, Kc, Kd, s_SPtrans);
* compare
*/
- const u32 r0 = iv1[0];
- const u32 r1 = iv1[1];
- const u32 r2 = iv2[0];
- const u32 r3 = iv2[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (iv1[0], iv1[1], iv2[0], iv2[1]);
}
}
-static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
const u32 s1 = salt_bufs[salt_pos].salt_buf[1];
const u32 s2 = salt_bufs[salt_pos].salt_buf[2];
- u32 data[2];
-
- data[0] = s0;
- data[1] = s1;
-
/**
* digest
*/
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MD4M_A;
- u32 b = MD4M_B;
- u32 c = MD4M_C;
- u32 d = MD4M_D;
+ u32x a = MD4M_A;
+ u32x b = MD4M_B;
+ u32x c = MD4M_C;
+ u32x d = MD4M_D;
#define w0_t w0
#define w1_t w[ 1]
MD4_STEP (MD4_H , a, b, c, d, w3_t, MD4C02, MD4S20);
MD4_STEP (MD4_H , d, a, b, c, wb_t, MD4C02, MD4S21);
- if (allx (s2 != ((d + MD4M_D) >> 16))) continue;
+ if (MATCHES_NONE_VS (((d + MD4M_D) >> 16), s2)) continue;
MD4_STEP (MD4_H , c, d, a, b, w7_t, MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, wf_t, MD4C02, MD4S23);
* DES1
*/
- u32 key[2];
+ u32x key[2];
transform_netntlmv1_key (a, b, key);
- u32 Kc[16];
- u32 Kd[16];
+ u32x Kc[16];
+ u32x Kd[16];
_des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);
- u32 iv1[2];
+ u32x data[2];
+
+ data[0] = s0;
+ data[1] = s1;
+
+ u32x iv1[2];
_des_crypt_encrypt (iv1, data, Kc, Kd, s_SPtrans);
_des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);
- u32 iv2[2];
+ u32x iv2[2];
_des_crypt_encrypt (iv2, data, Kc, Kd, s_SPtrans);
*/
- u32 iv2[2];
+ u32x iv2[2];
iv2[0] = search[2];
iv2[1] = search[3];
* compare
*/
- const u32 r0 = iv1[0];
- const u32 r1 = iv1[1];
- const u32 r2 = iv2[0];
- const u32 r3 = iv2[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (iv1[0], iv1[1], iv2[0], iv2[1]);
}
}
-__kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* modifier
m05500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m05500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m05500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* modifier
m05500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m05500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m05500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* modifier
m05500s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m05500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m05500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* modifier
m05500s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m05500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m05500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _NETNTLMV2_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void md4_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md4_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
MD4_STEP (MD4_Fo, a, b, c, d, w0_t, MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w1_t, MD4C00, MD4S01);
digest[3] += d;
}
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
digest[3] += d;
}
-static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4])
+static void hmac_md5_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
md5_transform (w0, w1, w2, w3, opad);
}
-static void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4])
+static void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
- w0[0] = w0l | w0r;
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
- u32 digest[4];
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x digest[4];
digest[0] = MD4M_A;
digest[1] = MD4M_B;
digest[2] = MD4M_C;
digest[3] = MD4M_D;
- md4_transform (w0, w1, w2, w3, digest);
-
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ md4_transform (w0_t, w1_t, w2_t, w3_t, digest);
w0_t[0] = digest[0];
w0_t[1] = digest[1];
digest[2] = MD5M_C;
digest[3] = MD5M_D;
- u32 ipad[4];
- u32 opad[4];
+ u32x ipad[4];
+ u32x opad[4];
hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[0];
- const u32 r1 = digest[3];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
+
+ const u32x w0lr = w0l | w0r;
+
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
- w0[0] = w0l | w0r;
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
- u32 digest[4];
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ u32x digest[4];
digest[0] = MD4M_A;
digest[1] = MD4M_B;
digest[2] = MD4M_C;
digest[3] = MD4M_D;
- md4_transform (w0, w1, w2, w3, digest);
-
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ md4_transform (w0_t, w1_t, w2_t, w3_t, digest);
w0_t[0] = digest[0];
w0_t[1] = digest[1];
digest[2] = MD5M_C;
digest[3] = MD5M_D;
- u32 ipad[4];
- u32 opad[4];
+ u32x ipad[4];
+ u32x opad[4];
hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[0];
- const u32 r1 = digest[3];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _RIPEMD160_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void ripemd160_transform (const u32 w[16], u32 dgst[5])
+static void ripemd160_transform (const u32x w[16], u32x dgst[5])
{
- u32 a1 = dgst[0];
- u32 b1 = dgst[1];
- u32 c1 = dgst[2];
- u32 d1 = dgst[3];
- u32 e1 = dgst[4];
+ u32x a1 = dgst[0];
+ u32x b1 = dgst[1];
+ u32x c1 = dgst[2];
+ u32x d1 = dgst[3];
+ u32x e1 = dgst[4];
RIPEMD160_STEP (RIPEMD160_F , a1, b1, c1, d1, e1, w[ 0], RIPEMD160C00, RIPEMD160S00);
RIPEMD160_STEP (RIPEMD160_F , e1, a1, b1, c1, d1, w[ 1], RIPEMD160C00, RIPEMD160S01);
RIPEMD160_STEP (RIPEMD160_J , c1, d1, e1, a1, b1, w[15], RIPEMD160C40, RIPEMD160S4E);
RIPEMD160_STEP (RIPEMD160_J , b1, c1, d1, e1, a1, w[13], RIPEMD160C40, RIPEMD160S4F);
- u32 a2 = dgst[0];
- u32 b2 = dgst[1];
- u32 c2 = dgst[2];
- u32 d2 = dgst[3];
- u32 e2 = dgst[4];
+ u32x a2 = dgst[0];
+ u32x b2 = dgst[1];
+ u32x c2 = dgst[2];
+ u32x d2 = dgst[3];
+ u32x e2 = dgst[4];
RIPEMD160_STEP_WORKAROUND_BUG (RIPEMD160_J , a2, b2, c2, d2, e2, w[ 5], RIPEMD160C50, RIPEMD160S50);
RIPEMD160_STEP (RIPEMD160_J , e2, a2, b2, c2, d2, w[14], RIPEMD160C50, RIPEMD160S51);
RIPEMD160_STEP (RIPEMD160_F , c2, d2, e2, a2, b2, w[ 9], RIPEMD160C90, RIPEMD160S9E);
RIPEMD160_STEP (RIPEMD160_F , b2, c2, d2, e2, a2, w[11], RIPEMD160C90, RIPEMD160S9F);
- const u32 a = dgst[1] + c1 + d2;
- const u32 b = dgst[2] + d1 + e2;
- const u32 c = dgst[3] + e1 + a2;
- const u32 d = dgst[4] + a1 + b2;
- const u32 e = dgst[0] + b1 + c2;
+ const u32x a = dgst[1] + c1 + d2;
+ const u32x b = dgst[2] + d1 + e2;
+ const u32x c = dgst[3] + e1 + a2;
+ const u32x d = dgst[4] + a1 + b2;
+ const u32x e = dgst[0] + b1 + c2;
dgst[0] = a;
dgst[1] = b;
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 wl[16];
+ u32x wl[16];
- wl[ 0] = w0[0];
+ wl[ 0] = w0lr;
wl[ 1] = w0[1];
wl[ 2] = w0[2];
wl[ 3] = w0[3];
wl[14] = w14;
wl[15] = 0;
- u32 dgst[5];
+ u32x dgst[5];
dgst[0] = RIPEMD160M_A;
dgst[1] = RIPEMD160M_B;
ripemd160_transform (wl, dgst);
- const u32 r0 = dgst[0];
- const u32 r1 = dgst[1];
- const u32 r2 = dgst[2];
- const u32 r3 = dgst[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 wl[16];
+ u32x wl[16];
- wl[ 0] = w0[0];
+ wl[ 0] = w0lr;
wl[ 1] = w0[1];
wl[ 2] = w0[2];
wl[ 3] = w0[3];
wl[14] = w14;
wl[15] = 0;
- u32 dgst[5];
+ u32x dgst[5];
dgst[0] = RIPEMD160M_A;
dgst[1] = RIPEMD160M_B;
ripemd160_transform (wl, dgst);
- const u32 r0 = dgst[0];
- const u32 r1 = dgst[1];
- const u32 r2 = dgst[2];
- const u32 r3 = dgst[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _WHIRLPOOL_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
#define R 10
+#if VECT_SIZE == 1
#define BOX(S,n,i) (S)[(n)][(i)]
+#elif VECT_SIZE == 2
+#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#elif VECT_SIZE == 4
+#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#elif VECT_SIZE == 8
+#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#endif
__constant u32 Ch[8][256] =
{
// this is a highly optimized that assumes dgst[16] = { 0 }; only reuse of no 2nd transform is needed
-static void whirlpool_transform (const u32 w[16], u32 dgst[16], __local u32 s_Ch[8][256], __local u32 s_Cl[8][256])
+static void whirlpool_transform (const u32x w[16], u32x dgst[16], __local u32 s_Ch[8][256], __local u32 s_Cl[8][256])
{
- u32 Kh[8];
- u32 Kl[8];
+ u32x Kh[8];
+ u32x Kl[8];
Kh[0] = 0x300beec0;
Kl[0] = 0xaf902967;
Kh[7] = 0x28282828;
Kl[7] = 0x28282828;
- u32 stateh[8];
- u32 statel[8];
+ u32x stateh[8];
+ u32x statel[8];
stateh[0] = w[ 0];
statel[0] = w[ 1];
stateh[7] = w[14];
statel[7] = w[15];
- u32 Lh[8];
- u32 Ll[8];
+ u32x Lh[8];
+ u32x Ll[8];
#pragma unroll
for (int i = 0; i < 8; i++)
{
- const u32 Lp0 = stateh[(i + 8) & 7] >> 24;
- const u32 Lp1 = stateh[(i + 7) & 7] >> 16;
- const u32 Lp2 = stateh[(i + 6) & 7] >> 8;
- const u32 Lp3 = stateh[(i + 5) & 7] >> 0;
- const u32 Lp4 = statel[(i + 4) & 7] >> 24;
- const u32 Lp5 = statel[(i + 3) & 7] >> 16;
- const u32 Lp6 = statel[(i + 2) & 7] >> 8;
- const u32 Lp7 = statel[(i + 1) & 7] >> 0;
+ const u32x Lp0 = stateh[(i + 8) & 7] >> 24;
+ const u32x Lp1 = stateh[(i + 7) & 7] >> 16;
+ const u32x Lp2 = stateh[(i + 6) & 7] >> 8;
+ const u32x Lp3 = stateh[(i + 5) & 7] >> 0;
+ const u32x Lp4 = statel[(i + 4) & 7] >> 24;
+ const u32x Lp5 = statel[(i + 3) & 7] >> 16;
+ const u32x Lp6 = statel[(i + 2) & 7] >> 8;
+ const u32x Lp7 = statel[(i + 1) & 7] >> 0;
Lh[i] = BOX (s_Ch, 0, Lp0 & 0xff)
^ BOX (s_Ch, 1, Lp1 & 0xff)
for (int r = 2; r <= R; r++)
{
- u32 Lh[8];
- u32 Ll[8];
+ u32x Lh[8];
+ u32x Ll[8];
#pragma unroll
for (int i = 0; i < 8; i++)
{
- const u32 Lp0 = Kh[(i + 8) & 7] >> 24;
- const u32 Lp1 = Kh[(i + 7) & 7] >> 16;
- const u32 Lp2 = Kh[(i + 6) & 7] >> 8;
- const u32 Lp3 = Kh[(i + 5) & 7] >> 0;
- const u32 Lp4 = Kl[(i + 4) & 7] >> 24;
- const u32 Lp5 = Kl[(i + 3) & 7] >> 16;
- const u32 Lp6 = Kl[(i + 2) & 7] >> 8;
- const u32 Lp7 = Kl[(i + 1) & 7] >> 0;
+ const u32x Lp0 = Kh[(i + 8) & 7] >> 24;
+ const u32x Lp1 = Kh[(i + 7) & 7] >> 16;
+ const u32x Lp2 = Kh[(i + 6) & 7] >> 8;
+ const u32x Lp3 = Kh[(i + 5) & 7] >> 0;
+ const u32x Lp4 = Kl[(i + 4) & 7] >> 24;
+ const u32x Lp5 = Kl[(i + 3) & 7] >> 16;
+ const u32x Lp6 = Kl[(i + 2) & 7] >> 8;
+ const u32x Lp7 = Kl[(i + 1) & 7] >> 0;
Lh[i] = BOX (s_Ch, 0, Lp0 & 0xff)
^ BOX (s_Ch, 1, Lp1 & 0xff)
#pragma unroll 8
for (int i = 0; i < 8; i++)
{
- const u32 Lp0 = stateh[(i + 8) & 7] >> 24;
- const u32 Lp1 = stateh[(i + 7) & 7] >> 16;
- const u32 Lp2 = stateh[(i + 6) & 7] >> 8;
- const u32 Lp3 = stateh[(i + 5) & 7] >> 0;
- const u32 Lp4 = statel[(i + 4) & 7] >> 24;
- const u32 Lp5 = statel[(i + 3) & 7] >> 16;
- const u32 Lp6 = statel[(i + 2) & 7] >> 8;
- const u32 Lp7 = statel[(i + 1) & 7] >> 0;
+ const u32x Lp0 = stateh[(i + 8) & 7] >> 24;
+ const u32x Lp1 = stateh[(i + 7) & 7] >> 16;
+ const u32x Lp2 = stateh[(i + 6) & 7] >> 8;
+ const u32x Lp3 = stateh[(i + 5) & 7] >> 0;
+ const u32x Lp4 = statel[(i + 4) & 7] >> 24;
+ const u32x Lp5 = statel[(i + 3) & 7] >> 16;
+ const u32x Lp6 = statel[(i + 2) & 7] >> 8;
+ const u32x Lp7 = statel[(i + 1) & 7] >> 0;
Lh[i] = BOX (s_Ch, 0, Lp0 & 0xff)
^ BOX (s_Ch, 1, Lp1 & 0xff)
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 wl[16];
+ u32x wl[16];
- wl[ 0] = w0[0];
+ wl[ 0] = w0lr;
wl[ 1] = w0[1];
wl[ 2] = w0[2];
wl[ 3] = w0[3];
wl[14] = 0;
wl[15] = pw_len * 8;
- u32 dgst[16];
+ u32x dgst[16];
whirlpool_transform (wl, dgst, s_Ch, s_Cl);
- const u32 r0 = dgst[0];
- const u32 r1 = dgst[1];
- const u32 r2 = dgst[2];
- const u32 r3 = dgst[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 wl[16];
+ u32x wl[16];
- wl[ 0] = w0[0];
+ wl[ 0] = w0lr;
wl[ 1] = w0[1];
wl[ 2] = w0[2];
wl[ 3] = w0[3];
wl[14] = 0;
wl[15] = pw_len * 8;
- u32 dgst[16];
+ u32x dgst[16];
whirlpool_transform (wl, dgst, s_Ch, s_Cl);
- const u32 r0 = dgst[0];
- const u32 r1 = dgst[1];
- const u32 r2 = dgst[2];
- const u32 r3 = dgst[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _GOST_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 c_tables[4][256] =
{
}
};
+#if VECT_SIZE == 1
#define BOX(i,n,S) (S)[(n)][(i)]
+#elif VECT_SIZE == 2
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#elif VECT_SIZE == 4
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#elif VECT_SIZE == 8
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#endif
#define round(k1,k2,tbl) \
{ \
- u32 t; \
+ u32x t; \
t = (k1) + r; \
l ^= BOX (((t >> 0) & 0xff), 0, tbl) ^ \
BOX (((t >> 8) & 0xff), 1, tbl) ^ \
#define R(k,h,s,i,t) \
{ \
- u32 r; \
- u32 l; \
+ u32x r; \
+ u32x l; \
r = h[i + 0]; \
l = h[i + 1]; \
round (k[0], k[1], t); \
#define A(x) \
{ \
- u32 l; \
- u32 r; \
+ u32x l; \
+ u32x r; \
l = x[0] ^ x[2]; \
r = x[1] ^ x[3]; \
x[0] = x[2]; \
#define AA(x) \
{ \
- u32 l; \
- u32 r; \
+ u32x l; \
+ u32x r; \
l = x[0]; \
r = x[2]; \
x[0] = x[4]; \
#define PASS0(h,s,u,v,t) \
{ \
- u32 k[8]; \
- u32 w[8]; \
+ u32x k[8]; \
+ u32x w[8]; \
X (w, u, v); \
P (k, w); \
R (k, h, s, 0, t); \
#define PASS2(h,s,u,v,t) \
{ \
- u32 k[8]; \
- u32 w[8]; \
+ u32x k[8]; \
+ u32x w[8]; \
X (w, u, v); \
P (k, w); \
R (k, h, s, 2, t); \
#define PASS4(h,s,u,v,t) \
{ \
- u32 k[8]; \
- u32 w[8]; \
+ u32x k[8]; \
+ u32x w[8]; \
X (w, u, v); \
P (k, w); \
R (k, h, s, 4, t); \
#define PASS6(h,s,u,v,t) \
{ \
- u32 k[8]; \
- u32 w[8]; \
+ u32x k[8]; \
+ u32x w[8]; \
X (w, u, v); \
P (k, w); \
R (k, h, s, 6, t); \
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 data[8];
+ u32x data[8];
- data[0] = w0[0];
+ data[0] = w0lr;
data[1] = w0[1];
data[2] = w0[2];
data[3] = w0[3];
data[6] = w1[2];
data[7] = w1[3];
- u32 state[16];
+ u32x state[16];
state[ 0] = 0;
state[ 1] = 0;
state[14] = data[6];
state[15] = data[7];
- u32 state_m[8];
- u32 data_m[8];
+ u32x state_m[8];
+ u32x data_m[8];
/* gost1 */
data_m[6] = data[6];
data_m[7] = data[7];
- u32 tmp[8];
+ u32x tmp[8];
if (pw_len > 0)
{
/* store */
- const u32 r0 = state[0];
- const u32 r1 = state[1];
- const u32 r2 = state[2];
- const u32 r3 = state[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (state[0], state[1], state[2], state[3]);
}
}
+
static void m06900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 s_tables[4][256])
{
/**
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 data[8];
+ u32x data[8];
- data[0] = w0[0];
+ data[0] = w0lr;
data[1] = w0[1];
data[2] = w0[2];
data[3] = w0[3];
data[6] = w1[2];
data[7] = w1[3];
- u32 state[16];
+ u32x state[16];
state[ 0] = 0;
state[ 1] = 0;
state[14] = data[6];
state[15] = data[7];
- u32 state_m[8];
- u32 data_m[8];
+ u32x state_m[8];
+ u32x data_m[8];
/* gost1 */
data_m[6] = data[6];
data_m[7] = data[7];
- u32 tmp[8];
+ u32x tmp[8];
if (pw_len > 0)
{
/* store */
- const u32 r0 = state[0];
- const u32 r1 = state[1];
- const u32 r2 = state[2];
- const u32 r3 = state[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (state[0], state[1], state[2], state[3]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
digest[4] += E;
}
-static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5])
+static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5])
{
w0[0] = w0[0] ^ 0x36363636;
w0[1] = w0[1] ^ 0x36363636;
sha1_transform (w0, w1, w2, w3, opad);
}
-static void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5])
+static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5])
{
digest[0] = ipad[0];
digest[1] = ipad[1];
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[5];
- u32 opad[5];
+ u32x ipad[5];
+ u32x opad[5];
hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (64 + esalt_size) * 8;
- u32 digest[5];
+ u32x digest[5];
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* pads
*/
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = 0;
- u32 ipad[5];
- u32 opad[5];
+ u32x ipad[5];
+ u32x opad[5];
hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad);
w3_t[2] = 0;
w3_t[3] = (64 + esalt_size) * 8;
- u32 digest[5];
+ u32x digest[5];
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
* Prepend salt
*/
- u32 w0t[4];
+ u32x w0t[4];
w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0
| uint_to_hex_lower8 ((a >> 16) & 255) << 16;
w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0
| uint_to_hex_lower8 ((b >> 0) & 255) << 16;
- u32 w1t[4];
+ u32x w1t[4];
w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0
| uint_to_hex_lower8 ((c >> 16) & 255) << 16;
w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0
| uint_to_hex_lower8 ((d >> 0) & 255) << 16;
- u32 w2t[2];
+ u32x w2t[2];
w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0
| uint_to_hex_lower8 ((e >> 16) & 255) << 16;
d += SHA1M_D;
e += SHA1M_E;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
- u32 r_e = e;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
+ u32x r_e = e;
// 2nd transform
d += r_d;
e += r_e;
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
* Prepend salt
*/
- u32 w0t[4];
+ u32x w0t[4];
w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0
| uint_to_hex_lower8 ((a >> 16) & 255) << 16;
w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0
| uint_to_hex_lower8 ((b >> 0) & 255) << 16;
- u32 w1t[4];
+ u32x w1t[4];
w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0
| uint_to_hex_lower8 ((c >> 16) & 255) << 16;
w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0
| uint_to_hex_lower8 ((d >> 0) & 255) << 16;
- u32 w2t[2];
+ u32x w2t[2];
w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0
| uint_to_hex_lower8 ((e >> 16) & 255) << 16;
d += SHA1M_D;
e += SHA1M_E;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
- u32 r_e = e;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
+ u32x r_e = e;
// 2nd transform
d += r_d;
e += r_e;
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 pw_salt_len = out_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 pw_salt_len = out_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 pw_salt_len = out_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, out_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
const u32 pw_salt_len = out_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
const u32 search[4] =
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
/**
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, pw_len);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 k_sha256[64] =
{
SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f,
};
-static void sha256_transform (u32 digest[8], const u32 w[16])
+#define SHA256_S0_S(x) (rotl32_S ((x), 25u) ^ rotl32_S ((x), 14u) ^ SHIFT_RIGHT_32 ((x), 3u))
+#define SHA256_S1_S(x) (rotl32_S ((x), 15u) ^ rotl32_S ((x), 13u) ^ SHIFT_RIGHT_32 ((x), 10u))
+
+#define SHA256_EXPAND_S(x,y,z,w) (SHA256_S1_S (x) + y + SHA256_S0_S (z) + w)
+
+static void sha256_transform (u32x digest[8], const u32x w[16])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
- u32 e = digest[4];
- u32 f = digest[5];
- u32 g = digest[6];
- u32 h = digest[7];
-
- u32 w0_t = w[ 0];
- u32 w1_t = w[ 1];
- u32 w2_t = w[ 2];
- u32 w3_t = w[ 3];
- u32 w4_t = w[ 4];
- u32 w5_t = w[ 5];
- u32 w6_t = w[ 6];
- u32 w7_t = w[ 7];
- u32 w8_t = w[ 8];
- u32 w9_t = w[ 9];
- u32 wa_t = w[10];
- u32 wb_t = w[11];
- u32 wc_t = w[12];
- u32 wd_t = w[13];
- u32 we_t = w[14];
- u32 wf_t = w[15];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+ u32x e = digest[4];
+ u32x f = digest[5];
+ u32x g = digest[6];
+ u32x h = digest[7];
+
+ u32x w0_t = w[ 0];
+ u32x w1_t = w[ 1];
+ u32x w2_t = w[ 2];
+ u32x w3_t = w[ 3];
+ u32x w4_t = w[ 4];
+ u32x w5_t = w[ 5];
+ u32x w6_t = w[ 6];
+ u32x w7_t = w[ 7];
+ u32x w8_t = w[ 8];
+ u32x w9_t = w[ 9];
+ u32x wa_t = w[10];
+ u32x wb_t = w[11];
+ u32x wc_t = w[12];
+ u32x wd_t = w[13];
+ u32x we_t = w[14];
+ u32x wf_t = w[15];
#define ROUND_EXPAND() \
{ \
digest[7] += h;
}
-static void sha256_transform_z (u32 digest[8])
+static void sha256_transform_z (u32x digest[8])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
- u32 e = digest[4];
- u32 f = digest[5];
- u32 g = digest[6];
- u32 h = digest[7];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+ u32x e = digest[4];
+ u32x f = digest[5];
+ u32x g = digest[6];
+ u32x h = digest[7];
#define ROUND_STEP_Z(i) \
{ \
digest[7] += h;
}
-static void sha256_transform_s (u32 digest[8], __local u32 w[64])
+static void sha256_transform_s (u32x digest[8], __local u32 w[64])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
- u32 e = digest[4];
- u32 f = digest[5];
- u32 g = digest[6];
- u32 h = digest[7];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+ u32x e = digest[4];
+ u32x f = digest[5];
+ u32x g = digest[6];
+ u32x h = digest[7];
#define ROUND_STEP_S(i) \
{ \
digest[7] += h;
}
-static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 gid_max)
+static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 gid_max)
{
/**
* modifier
* salt
*/
- const u32 salt_buf0 = swap32 (salt_bufs[salt_pos].salt_buf[ 0]);
- const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]);
- const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80
+ const u32 salt_buf0 = swap32_S (salt_bufs[salt_pos].salt_buf[ 0]);
+ const u32 salt_buf1 = swap32_S (salt_bufs[salt_pos].salt_buf[ 1]);
+ const u32 salt_buf2 = swap32_S (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80
/**
* precompute final msg blocks
#pragma unroll
for (int i = 16; i < 64; i++)
{
- w_s1[i] = SHA256_EXPAND (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
+ w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
}
w_s2[ 0] = salt_buf0 << 16 | salt_buf1 >> 16;
#pragma unroll
for (int i = 16; i < 64; i++)
{
- w_s2[i] = SHA256_EXPAND (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
+ w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
}
}
* modifier
*/
- w[ 1] = w[ 1] >> 8;
- w[ 2] = w[ 2] >> 8;
- w[ 3] = w[ 3] >> 8;
- w[ 4] = w[ 4] >> 8;
- w[ 5] = w[ 5] >> 8;
- w[ 6] = w[ 6] >> 8;
- w[ 7] = w[ 7] >> 8;
- w[ 8] = w[ 8] >> 8;
- w[ 9] = w[ 9] >> 8;
- w[10] = w[10] >> 8;
- w[11] = w[11] >> 8;
- w[12] = w[12] >> 8;
- w[13] = w[13] >> 8;
- w[14] = w[14] >> 8;
- w[15] = w[15] >> 8;
+ u32x w_t[16];
+
+ w_t[ 0] = w[ 0] >> 8;
+ w_t[ 1] = w[ 1] >> 8;
+ w_t[ 2] = w[ 2] >> 8;
+ w_t[ 3] = w[ 3] >> 8;
+ w_t[ 4] = w[ 4] >> 8;
+ w_t[ 5] = w[ 5] >> 8;
+ w_t[ 6] = w[ 6] >> 8;
+ w_t[ 7] = w[ 7] >> 8;
+ w_t[ 8] = w[ 8] >> 8;
+ w_t[ 9] = w[ 9] >> 8;
+ w_t[10] = w[10] >> 8;
+ w_t[11] = w[11] >> 8;
+ w_t[12] = w[12] >> 8;
+ w_t[13] = w[13] >> 8;
+ w_t[14] = w[14] >> 8;
+ w_t[15] = w[15] >> 8;
/**
* loop
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- w[0] = w0 >> 8;
+ w_t[0] = w0lr >> 8;
- u32 digest[8];
+ u32x digest[8];
digest[0] = SHA256M_A;
digest[1] = SHA256M_B;
digest[6] = SHA256M_G;
digest[7] = SHA256M_H;
- sha256_transform (digest, w); // 0 - 64
+ sha256_transform (digest, w_t); // 0 - 64
sha256_transform_z (digest); // 64 - 128
sha256_transform_z (digest); // 128 - 192
sha256_transform_z (digest); // 192 - 256
sha256_transform_s (digest, w_s1); // 448 - 512
sha256_transform_s (digest, w_s2); // 512 - 576
- const u32 r0 = digest[3];
- const u32 r1 = digest[7];
- const u32 r2 = digest[2];
- const u32 r3 = digest[6];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]);
}
}
-static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 gid_max)
+static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 gid_max)
{
/**
* modifier
* salt
*/
- const u32 salt_buf0 = swap32 (salt_bufs[salt_pos].salt_buf[ 0]);
- const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]);
- const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80
+ const u32 salt_buf0 = swap32_S (salt_bufs[salt_pos].salt_buf[ 0]);
+ const u32 salt_buf1 = swap32_S (salt_bufs[salt_pos].salt_buf[ 1]);
+ const u32 salt_buf2 = swap32_S (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80
/**
* precompute final msg blocks
#pragma unroll
for (int i = 16; i < 64; i++)
{
- w_s1[i] = SHA256_EXPAND (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
+ w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]);
}
w_s2[ 0] = salt_buf0 << 16 | salt_buf1 >> 16;
#pragma unroll
for (int i = 16; i < 64; i++)
{
- w_s2[i] = SHA256_EXPAND (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
+ w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]);
}
}
* modifier
*/
- w[ 1] = w[ 1] >> 8;
- w[ 2] = w[ 2] >> 8;
- w[ 3] = w[ 3] >> 8;
- w[ 4] = w[ 4] >> 8;
- w[ 5] = w[ 5] >> 8;
- w[ 6] = w[ 6] >> 8;
- w[ 7] = w[ 7] >> 8;
- w[ 8] = w[ 8] >> 8;
- w[ 9] = w[ 9] >> 8;
- w[10] = w[10] >> 8;
- w[11] = w[11] >> 8;
- w[12] = w[12] >> 8;
- w[13] = w[13] >> 8;
- w[14] = w[14] >> 8;
- w[15] = w[15] >> 8;
+ u32x w_t[16];
+
+ w_t[ 0] = w[ 0] >> 8;
+ w_t[ 1] = w[ 1] >> 8;
+ w_t[ 2] = w[ 2] >> 8;
+ w_t[ 3] = w[ 3] >> 8;
+ w_t[ 4] = w[ 4] >> 8;
+ w_t[ 5] = w[ 5] >> 8;
+ w_t[ 6] = w[ 6] >> 8;
+ w_t[ 7] = w[ 7] >> 8;
+ w_t[ 8] = w[ 8] >> 8;
+ w_t[ 9] = w[ 9] >> 8;
+ w_t[10] = w[10] >> 8;
+ w_t[11] = w[11] >> 8;
+ w_t[12] = w[12] >> 8;
+ w_t[13] = w[13] >> 8;
+ w_t[14] = w[14] >> 8;
+ w_t[15] = w[15] >> 8;
/**
* digest
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- w[0] = w0 >> 8;
+ w_t[0] = w0lr >> 8;
- u32 digest[8];
+ u32x digest[8];
digest[0] = SHA256M_A;
digest[1] = SHA256M_B;
digest[6] = SHA256M_G;
digest[7] = SHA256M_H;
- sha256_transform (digest, w); // 0 - 64
+ sha256_transform (digest, w_t); // 0 - 64
sha256_transform_z (digest); // 64 - 128
sha256_transform_z (digest); // 128 - 192
sha256_transform_z (digest); // 192 - 256
sha256_transform_s (digest, w_s1); // 448 - 512
sha256_transform_s (digest, w_s2); // 512 - 576
- const u32 r0 = digest[3];
- const u32 r1 = digest[7];
- const u32 r2 = digest[2];
- const u32 r3 = digest[6];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]);
}
}
-__kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08000m (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max);
}
-__kernel void m08000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08000m (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max);
}
-__kernel void m08000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08000m (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max);
}
-__kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08000s (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max);
}
-__kernel void m08000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08000s (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max);
}
-__kernel void m08000_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08000_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m08100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* prepend salt
*/
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1];
- w0_t[2] = w0[0];
+ w0_t[2] = w0lr;
w0_t[3] = w0[1];
w1_t[0] = w0[2];
w1_t[1] = w0[3];
* sha1
*/
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
* reverse
*/
- const u32 e_rev = rotl32 (search[1], 2u);
+ const u32 e_rev = rotl32_S (search[1], 2u);
/**
* salt
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* prepend salt
*/
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1];
- w0_t[2] = w0[0];
+ w0_t[2] = w0lr;
w0_t[3] = w0[1];
w1_t[0] = w0[2];
w1_t[1] = w0[3];
* sha1
*/
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
- if (allx (e != e_rev)) continue;
+ if (MATCHES_NONE_VS (e, e_rev)) continue;
w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
* base
*/
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
- append_0x80_2x4 (w0, w1, pw_len + 1);
+ append_0x80_2x4_S (w0, w1, pw_len + 1);
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
/**
* main
* base
*/
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
- w1[1] = swap32 (w1[1]);
- w1[2] = swap32 (w1[2]);
- w1[3] = swap32 (w1[3]);
-
- append_0x80_3x4 (w0, w1, w2, pw_len + 1);
-
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
- w1[1] = swap32 (w1[1]);
- w1[2] = swap32 (w1[2]);
- w1[3] = swap32 (w1[3]);
- w2[0] = swap32 (w2[0]);
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
+ w1[1] = swap32_S (w1[1]);
+ w1[2] = swap32_S (w1[2]);
+ w1[3] = swap32_S (w1[3]);
+
+ append_0x80_3x4_S (w0, w1, w2, pw_len + 1);
+
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
+ w1[1] = swap32_S (w1[1]);
+ w1[2] = swap32_S (w1[2]);
+ w1[3] = swap32_S (w1[3]);
+ w2[0] = swap32_S (w2[0]);
/**
* main
* base
*/
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
- w1[1] = swap32 (w1[1]);
- w1[2] = swap32 (w1[2]);
- w1[3] = swap32 (w1[3]);
- w2[0] = swap32 (w2[0]);
- w2[1] = swap32 (w2[1]);
- w2[2] = swap32 (w2[2]);
- w2[3] = swap32 (w2[3]);
- w3[0] = swap32 (w3[0]);
- w3[1] = swap32 (w3[1]);
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
+ w1[1] = swap32_S (w1[1]);
+ w1[2] = swap32_S (w1[2]);
+ w1[3] = swap32_S (w1[3]);
+ w2[0] = swap32_S (w2[0]);
+ w2[1] = swap32_S (w2[1]);
+ w2[2] = swap32_S (w2[2]);
+ w2[3] = swap32_S (w2[3]);
+ w3[0] = swap32_S (w3[0]);
+ w3[1] = swap32_S (w3[1]);
w3[2] = 0;
w3[3] = 0;
- append_0x80_4x4 (w0, w1, w2, w3, pw_len + 1);
-
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
- w1[1] = swap32 (w1[1]);
- w1[2] = swap32 (w1[2]);
- w1[3] = swap32 (w1[3]);
- w2[0] = swap32 (w2[0]);
- w2[1] = swap32 (w2[1]);
- w2[2] = swap32 (w2[2]);
- w2[3] = swap32 (w2[3]);
- w3[0] = swap32 (w3[0]);
- w3[1] = swap32 (w3[1]);
+ append_0x80_4x4_S (w0, w1, w2, w3, pw_len + 1);
+
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
+ w1[1] = swap32_S (w1[1]);
+ w1[2] = swap32_S (w1[2]);
+ w1[3] = swap32_S (w1[3]);
+ w2[0] = swap32_S (w2[0]);
+ w2[1] = swap32_S (w2[1]);
+ w2[2] = swap32_S (w2[2]);
+ w2[3] = swap32_S (w2[3]);
+ w3[0] = swap32_S (w3[0]);
+ w3[1] = swap32_S (w3[1]);
w3[2] = 0;
w3[3] = 0;
* base
*/
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
- append_0x80_2x4 (w0, w1, pw_len + 1);
+ append_0x80_2x4_S (w0, w1, pw_len + 1);
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
/**
* main
* base
*/
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
- w1[1] = swap32 (w1[1]);
- w1[2] = swap32 (w1[2]);
- w1[3] = swap32 (w1[3]);
-
- append_0x80_3x4 (w0, w1, w2, pw_len + 1);
-
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
- w1[1] = swap32 (w1[1]);
- w1[2] = swap32 (w1[2]);
- w1[3] = swap32 (w1[3]);
- w2[0] = swap32 (w2[0]);
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
+ w1[1] = swap32_S (w1[1]);
+ w1[2] = swap32_S (w1[2]);
+ w1[3] = swap32_S (w1[3]);
+
+ append_0x80_3x4_S (w0, w1, w2, pw_len + 1);
+
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
+ w1[1] = swap32_S (w1[1]);
+ w1[2] = swap32_S (w1[2]);
+ w1[3] = swap32_S (w1[3]);
+ w2[0] = swap32_S (w2[0]);
/**
* main
* base
*/
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
- w1[1] = swap32 (w1[1]);
- w1[2] = swap32 (w1[2]);
- w1[3] = swap32 (w1[3]);
- w2[0] = swap32 (w2[0]);
- w2[1] = swap32 (w2[1]);
- w2[2] = swap32 (w2[2]);
- w2[3] = swap32 (w2[3]);
- w3[0] = swap32 (w3[0]);
- w3[1] = swap32 (w3[1]);
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
+ w1[1] = swap32_S (w1[1]);
+ w1[2] = swap32_S (w1[2]);
+ w1[3] = swap32_S (w1[3]);
+ w2[0] = swap32_S (w2[0]);
+ w2[1] = swap32_S (w2[1]);
+ w2[2] = swap32_S (w2[2]);
+ w2[3] = swap32_S (w2[3]);
+ w3[0] = swap32_S (w3[0]);
+ w3[1] = swap32_S (w3[1]);
w3[2] = 0;
w3[3] = 0;
- append_0x80_4x4 (w0, w1, w2, w3, pw_len + 1);
-
- w0[0] = swap32 (w0[0]);
- w0[1] = swap32 (w0[1]);
- w0[2] = swap32 (w0[2]);
- w0[3] = swap32 (w0[3]);
- w1[0] = swap32 (w1[0]);
- w1[1] = swap32 (w1[1]);
- w1[2] = swap32 (w1[2]);
- w1[3] = swap32 (w1[3]);
- w2[0] = swap32 (w2[0]);
- w2[1] = swap32 (w2[1]);
- w2[2] = swap32 (w2[2]);
- w2[3] = swap32 (w2[3]);
- w3[0] = swap32 (w3[0]);
- w3[1] = swap32 (w3[1]);
+ append_0x80_4x4_S (w0, w1, w2, w3, pw_len + 1);
+
+ w0[0] = swap32_S (w0[0]);
+ w0[1] = swap32_S (w0[1]);
+ w0[2] = swap32_S (w0[2]);
+ w0[3] = swap32_S (w0[3]);
+ w1[0] = swap32_S (w1[0]);
+ w1[1] = swap32_S (w1[1]);
+ w1[2] = swap32_S (w1[2]);
+ w1[3] = swap32_S (w1[3]);
+ w2[0] = swap32_S (w2[0]);
+ w2[1] = swap32_S (w2[1]);
+ w2[2] = swap32_S (w2[2]);
+ w2[3] = swap32_S (w2[3]);
+ w3[0] = swap32_S (w3[0]);
+ w3[1] = swap32_S (w3[1]);
w3[2] = 0;
w3[3] = 0;
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1);
w0_t[0] |= pw_len & 0xff;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + out_len + domain_len + 1);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + out_len + domain_len + 1);
u32 d0[4];
d3[2] = 0;
d3[3] = 0;
- switch_buffer_by_offset (d0, d1, d2, d3, 1 + out_len);
+ switch_buffer_by_offset_le (d0, d1, d2, d3, 1 + out_len);
/**
* sha1
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1);
w0_t[0] |= pw_len & 0xff;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + out_len + domain_len + 1);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + out_len + domain_len + 1);
u32 d0[4];
d3[2] = 0;
d3[3] = 0;
- switch_buffer_by_offset (d0, d1, d2, d3, 1 + out_len);
+ switch_buffer_by_offset_le (d0, d1, d2, d3, 1 + out_len);
/**
* sha1
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1);
w0_t[0] |= pw_len & 0xff;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len + domain_len + 1);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len + domain_len + 1);
u32 d0[4];
d3[2] = 0;
d3[3] = 0;
- switch_buffer_by_offset (d0, d1, d2, d3, 1 + pw_len);
+ switch_buffer_by_offset_le (d0, d1, d2, d3, 1 + pw_len);
/**
* sha1
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1);
w0_t[0] |= pw_len & 0xff;
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len + domain_len + 1);
+ switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len + domain_len + 1);
u32 d0[4];
d3[2] = 0;
d3[3] = 0;
- switch_buffer_by_offset (d0, d1, d2, d3, 1 + pw_len);
+ switch_buffer_by_offset_le (d0, d1, d2, d3, 1 + pw_len);
/**
* sha1
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len + domain_len + 1);
+ switch_buffer_by_offset_le_S (s0, s1, s2, s3, 1 + pw_len + domain_len + 1);
u32 d0[4];
d3[2] = 0;
d3[3] = 0;
- switch_buffer_by_offset (d0, d1, d2, d3, 1 + pw_len);
+ switch_buffer_by_offset_le_S (d0, d1, d2, d3, 1 + pw_len);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1);
w0_t[0] |= pw_len & 0xff;
* sha1
*/
- u32 w0_t2[4];
+ u32x w0_t2[4];
w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]);
w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]);
w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]);
w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]);
- u32 w1_t2[4];
+ u32x w1_t2[4];
w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]);
w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]);
w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]);
w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]);
- u32 w2_t2[4];
+ u32x w2_t2[4];
w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]);
w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]);
w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]);
w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]);
- u32 w3_t2[4];
+ u32x w3_t2[4];
w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]);
w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]);
w3_t2[2] = 0;
w3_t2[3] = (1 + pw_len + domain_len + 1 + salt_len) * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
for (u32 i = 0; i < salt_iter; i++)
{
- u32 w0_t3[4];
+ u32x w0_t3[4];
w0_t3[0] = digest[0];
w0_t3[1] = digest[1];
w0_t3[2] = digest[2];
w0_t3[3] = digest[3];
- u32 w1_t3[4];
+ u32x w1_t3[4];
w1_t3[0] = digest[4];
w1_t3[1] = swap32 (salt_buf0[0]);
w1_t3[2] = swap32 (salt_buf0[1]);
w1_t3[3] = swap32 (salt_buf0[2]);
- u32 w2_t3[4];
+ u32x w2_t3[4];
w2_t3[0] = swap32 (salt_buf0[3]);
w2_t3[1] = swap32 (salt_buf1[0]);
w2_t3[2] = swap32 (salt_buf1[1]);
w2_t3[3] = swap32 (salt_buf1[2]);
- u32 w3_t3[4];
+ u32x w3_t3[4];
w3_t3[0] = swap32 (salt_buf1[3]);
w3_t3[1] = 0;
sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest);
}
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
s3[2] = 0;
s3[3] = 0;
- switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len + domain_len + 1);
+ switch_buffer_by_offset_le_S (s0, s1, s2, s3, 1 + pw_len + domain_len + 1);
u32 d0[4];
d3[2] = 0;
d3[3] = 0;
- switch_buffer_by_offset (d0, d1, d2, d3, 1 + pw_len);
+ switch_buffer_by_offset_le_S (d0, d1, d2, d3, 1 + pw_len);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = w3[2];
w3_t[3] = w3[3];
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1);
w0_t[0] |= pw_len & 0xff;
* sha1
*/
- u32 w0_t2[4];
+ u32x w0_t2[4];
w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]);
w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]);
w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]);
w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]);
- u32 w1_t2[4];
+ u32x w1_t2[4];
w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]);
w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]);
w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]);
w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]);
- u32 w2_t2[4];
+ u32x w2_t2[4];
w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]);
w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]);
w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]);
w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]);
- u32 w3_t2[4];
+ u32x w3_t2[4];
w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]);
w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]);
w3_t2[2] = 0;
w3_t2[3] = (1 + pw_len + domain_len + 1 + salt_len) * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
for (u32 i = 0; i < salt_iter; i++)
{
- u32 w0_t3[4];
+ u32x w0_t3[4];
w0_t3[0] = digest[0];
w0_t3[1] = digest[1];
w0_t3[2] = digest[2];
w0_t3[3] = digest[3];
- u32 w1_t3[4];
+ u32x w1_t3[4];
w1_t3[0] = digest[4];
w1_t3[1] = swap32 (salt_buf0[0]);
w1_t3[2] = swap32 (salt_buf0[1]);
w1_t3[3] = swap32 (salt_buf0[2]);
- u32 w2_t3[4];
+ u32x w2_t3[4];
w2_t3[0] = swap32 (salt_buf0[3]);
w2_t3[1] = swap32 (salt_buf1[0]);
w2_t3[2] = swap32 (salt_buf1[1]);
w2_t3[3] = swap32 (salt_buf1[2]);
- u32 w3_t3[4];
+ u32x w3_t3[4];
w3_t3[0] = swap32 (salt_buf1[3]);
w3_t3[1] = 0;
sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest);
}
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8_le(i) l_bin2asc[(i)]
-
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
+
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
u32 salt_buf0[4];
- salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]);
- salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]);
- salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]);
- salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]);
+ salt_buf0[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 0]);
+ salt_buf0[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 1]);
+ salt_buf0[2] = swap32_S (salt_bufs[salt_pos].salt_buf[ 2]);
+ salt_buf0[3] = swap32_S (salt_bufs[salt_pos].salt_buf[ 3]);
u32 salt_buf1[4];
- salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]);
- salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]);
- salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]);
- salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]);
+ salt_buf1[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 4]);
+ salt_buf1[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 5]);
+ salt_buf1[2] = swap32_S (salt_bufs[salt_pos].salt_buf[ 6]);
+ salt_buf1[3] = swap32_S (salt_bufs[salt_pos].salt_buf[ 7]);
u32 salt_buf2[4];
- salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]);
- salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]);
+ salt_buf2[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 8]);
+ salt_buf2[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 9]);
salt_buf2[2] = 0;
salt_buf2[3] = 0;
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = pw_len * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
- u32 a;
- u32 b;
- u32 c;
- u32 d;
- u32 e;
+ u32x a;
+ u32x b;
+ u32x c;
+ u32x d;
+ u32x e;
a = digest[0];
b = digest[1];
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
u32 salt_buf0[4];
- salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]);
- salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]);
- salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]);
- salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]);
+ salt_buf0[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 0]);
+ salt_buf0[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 1]);
+ salt_buf0[2] = swap32_S (salt_bufs[salt_pos].salt_buf[ 2]);
+ salt_buf0[3] = swap32_S (salt_bufs[salt_pos].salt_buf[ 3]);
u32 salt_buf1[4];
- salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]);
- salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]);
- salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]);
- salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]);
+ salt_buf1[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 4]);
+ salt_buf1[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 5]);
+ salt_buf1[2] = swap32_S (salt_bufs[salt_pos].salt_buf[ 6]);
+ salt_buf1[3] = swap32_S (salt_bufs[salt_pos].salt_buf[ 7]);
u32 salt_buf2[4];
- salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]);
- salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]);
+ salt_buf2[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 8]);
+ salt_buf2[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 9]);
salt_buf2[2] = 0;
salt_buf2[3] = 0;
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0];
w2_t[1] = w2[1];
w2_t[2] = w2[2];
w2_t[3] = w2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0];
w3_t[1] = w3[1];
w3_t[2] = 0;
w3_t[3] = pw_len * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
- u32 a;
- u32 b;
- u32 c;
- u32 d;
- u32 e;
+ u32x a;
+ u32x b;
+ u32x c;
+ u32x d;
+ u32x e;
a = digest[0];
b = digest[1];
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
- const u32 r0 = digest[3];
- const u32 r1 = digest[4];
- const u32 r2 = digest[2];
- const u32 r3 = digest[1];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _DES_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
#define PERM_OP(a,b,tt,n,m) \
{ \
}
};
+#if VECT_SIZE == 1
#define BOX(i,n,S) (S)[(n)][(i)]
-
-static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 s_SPtrans[8][64])
+#elif VECT_SIZE == 2
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#elif VECT_SIZE == 4
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#elif VECT_SIZE == 8
+#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#endif
+
+#if VECT_SIZE == 1
+#define BOX1(i,S) (S)[(i)]
+#elif VECT_SIZE == 2
+#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#elif VECT_SIZE == 4
+#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#elif VECT_SIZE == 8
+#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#endif
+
+static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 s_SPtrans[8][64])
{
- u32 tt;
-
- u32 r = data[0];
- u32 l = data[1];
+ u32x r = data[0];
+ u32x l = data[1];
#pragma unroll 16
for (u32 i = 0; i < 16; i += 2)
{
- u32 u;
- u32 t;
+ u32x u;
+ u32x t;
u = Kc[i + 0] ^ r;
t = Kd[i + 0] ^ rotl32 (r, 28u);
iv[1] = r;
}
-static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 s_skb[8][64])
+static void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 s_skb[8][64])
{
- u32 tt;
+ u32x tt;
PERM_OP (d, c, tt, 4, 0x0f0f0f0f);
HPERM_OP (c, tt, 2, 0xcccc0000);
c = c & 0x0fffffff;
d = d & 0x0fffffff;
- const u32 c00 = (c >> 0) & 0x0000003f;
- const u32 c06 = (c >> 6) & 0x00383003;
- const u32 c07 = (c >> 7) & 0x0000003c;
- const u32 c13 = (c >> 13) & 0x0000060f;
- const u32 c20 = (c >> 20) & 0x00000001;
-
- u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb)
- | BOX (((c06 >> 0) & 0xff)
- |((c07 >> 0) & 0xff), 1, s_skb)
- | BOX (((c13 >> 0) & 0xff)
- |((c06 >> 8) & 0xff), 2, s_skb)
- | BOX (((c20 >> 0) & 0xff)
- |((c13 >> 8) & 0xff)
- |((c06 >> 16) & 0xff), 3, s_skb);
-
- const u32 d00 = (d >> 0) & 0x00003c3f;
- const u32 d07 = (d >> 7) & 0x00003f03;
- const u32 d21 = (d >> 21) & 0x0000000f;
- const u32 d22 = (d >> 22) & 0x00000030;
-
- u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb)
- | BOX (((d07 >> 0) & 0xff)
- |((d00 >> 8) & 0xff), 5, s_skb)
- | BOX (((d07 >> 8) & 0xff), 6, s_skb)
- | BOX (((d21 >> 0) & 0xff)
- |((d22 >> 0) & 0xff), 7, s_skb);
+ const u32x c00 = (c >> 0) & 0x0000003f;
+ const u32x c06 = (c >> 6) & 0x00383003;
+ const u32x c07 = (c >> 7) & 0x0000003c;
+ const u32x c13 = (c >> 13) & 0x0000060f;
+ const u32x c20 = (c >> 20) & 0x00000001;
+
+ u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb)
+ | BOX (((c06 >> 0) & 0xff)
+ |((c07 >> 0) & 0xff), 1, s_skb)
+ | BOX (((c13 >> 0) & 0xff)
+ |((c06 >> 8) & 0xff), 2, s_skb)
+ | BOX (((c20 >> 0) & 0xff)
+ |((c13 >> 8) & 0xff)
+ |((c06 >> 16) & 0xff), 3, s_skb);
+
+ const u32x d00 = (d >> 0) & 0x00003c3f;
+ const u32x d07 = (d >> 7) & 0x00003f03;
+ const u32x d21 = (d >> 21) & 0x0000000f;
+ const u32x d22 = (d >> 22) & 0x00000030;
+
+ u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb)
+ | BOX (((d07 >> 0) & 0xff)
+ |((d00 >> 8) & 0xff), 5, s_skb)
+ | BOX (((d07 >> 8) & 0xff), 6, s_skb)
+ | BOX (((d21 >> 0) & 0xff)
+ |((d22 >> 0) & 0xff), 7, s_skb);
Kc[i] = ((t << 16) | (s & 0x0000ffff));
Kd[i] = ((s >> 16) | (t & 0xffff0000));
}
}
-static void transform_racf_key (const u32 w0, const u32 w1, u32 key[2])
+static void transform_racf_key (const u32x w0, const u32x w1, u32x key[2])
{
- key[0] = (ascii_to_ebcdic_pc[(w0 >> 0) & 0xff]) << 0
- | (ascii_to_ebcdic_pc[(w0 >> 8) & 0xff]) << 8
- | (ascii_to_ebcdic_pc[(w0 >> 16) & 0xff]) << 16
- | (ascii_to_ebcdic_pc[(w0 >> 24) & 0xff]) << 24;
-
- key[1] = (ascii_to_ebcdic_pc[(w1 >> 0) & 0xff]) << 0
- | (ascii_to_ebcdic_pc[(w1 >> 8) & 0xff]) << 8
- | (ascii_to_ebcdic_pc[(w1 >> 16) & 0xff]) << 16
- | (ascii_to_ebcdic_pc[(w1 >> 24) & 0xff]) << 24;
+ key[0] = BOX1 (((w0 >> 0) & 0xff), ascii_to_ebcdic_pc) << 0
+ | BOX1 (((w0 >> 8) & 0xff), ascii_to_ebcdic_pc) << 8
+ | BOX1 (((w0 >> 16) & 0xff), ascii_to_ebcdic_pc) << 16
+ | BOX1 (((w0 >> 24) & 0xff), ascii_to_ebcdic_pc) << 24;
+
+ key[1] = BOX1 (((w1 >> 0) & 0xff), ascii_to_ebcdic_pc) << 0
+ | BOX1 (((w1 >> 8) & 0xff), ascii_to_ebcdic_pc) << 8
+ | BOX1 (((w1 >> 16) & 0xff), ascii_to_ebcdic_pc) << 16
+ | BOX1 (((w1 >> 24) & 0xff), ascii_to_ebcdic_pc) << 24;
}
-static void m08500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m08500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w1 = w[1];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 key[2];
+ u32x key[2];
transform_racf_key (w0, w1, key);
- const u32 c = key[0];
- const u32 d = key[1];
+ const u32x c = key[0];
+ const u32x d = key[1];
- u32 Kc[16];
- u32 Kd[16];
+ u32x Kc[16];
+ u32x Kd[16];
_des_crypt_keysetup (c, d, Kc, Kd, s_skb);
- u32 data[2];
+ u32x data[2];
data[0] = salt_buf0[0];
data[1] = salt_buf0[1];
- u32 iv[2];
+ u32x iv[2];
_des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans);
- const u32 r0 = iv[0];
- const u32 r1 = iv[1];
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x iv2 = 0;
+ u32x iv3 = 0;
- #include COMPARE_M
+ COMPARE_M_SIMD (iv[0], iv[1], iv2, iv3);
}
}
-static void m08500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m08500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w1 = w[1];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 key[2];
+ u32x key[2];
transform_racf_key (w0, w1, key);
- const u32 c = key[0];
- const u32 d = key[1];
+ const u32x c = key[0];
+ const u32x d = key[1];
- u32 Kc[16];
- u32 Kd[16];
+ u32x Kc[16];
+ u32x Kd[16];
_des_crypt_keysetup (c, d, Kc, Kd, s_skb);
- u32 data[2];
+ u32x data[2];
data[0] = salt_buf0[0];
data[1] = salt_buf0[1];
- u32 iv[2];
+ u32x iv[2];
_des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans);
- const u32 r0 = iv[0];
- const u32 r1 = iv[1];
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x iv2 = 0;
+ u32x iv3 = 0;
- #include COMPARE_S
+ COMPARE_S_SIMD (iv[0], iv[1], iv2, iv3);
}
}
-__kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m08500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08500s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
-__kernel void m08500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
#define _LOTUS5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 lotus_magic_table[256] =
{
0x29, 0x39, 0xb9, 0xe9, 0x4c, 0xff, 0x43, 0xab,
};
-#define BOX(S,i) (S)[(i)]
-
-static void lotus_mix (u32 *in, __local u32 s_lotus_magic_table[256])
+#if VECT_SIZE == 1
+#define BOX1(S,i) (S)[(i)]
+#elif VECT_SIZE == 2
+#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#elif VECT_SIZE == 4
+#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#elif VECT_SIZE == 8
+#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#endif
+
+static void lotus_mix (u32x *in, __local u32 s_lotus_magic_table[256])
{
- u32 p = 0;
+ u32x p = 0;
for (int i = 0; i < 18; i++)
{
#pragma unroll 12
for (int j = 0; j < 12; j++)
{
- u32 tmp_in = in[j];
- u32 tmp_out = 0;
+ u32x tmp_in = in[j];
+ u32x tmp_out = 0;
- p = (p + s--) & 0xff; p = ((tmp_in >> 0) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 0;
- p = (p + s--) & 0xff; p = ((tmp_in >> 8) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 8;
- p = (p + s--) & 0xff; p = ((tmp_in >> 16) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 16;
- p = (p + s--) & 0xff; p = ((tmp_in >> 24) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 24;
+ p = (p + s--) & 0xff; p = ((tmp_in >> 0) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 0;
+ p = (p + s--) & 0xff; p = ((tmp_in >> 8) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 8;
+ p = (p + s--) & 0xff; p = ((tmp_in >> 16) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 16;
+ p = (p + s--) & 0xff; p = ((tmp_in >> 24) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 24;
in[j] = tmp_out;
}
}
}
-static void lotus_transform_password (u32 in[4], u32 out[4], __local u32 s_lotus_magic_table[256])
+static void lotus_transform_password (u32x in[4], u32x out[4], __local u32 s_lotus_magic_table[256])
{
- u32 t = out[3] >> 24;
+ u32x t = out[3] >> 24;
- u32 c;
+ u32x c;
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
- t ^= (in[i] >> 0) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
- t ^= (in[i] >> 8) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 8; t = ((out[i] >> 8) & 0xff);
- t ^= (in[i] >> 16) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 16; t = ((out[i] >> 16) & 0xff);
- t ^= (in[i] >> 24) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 24; t = ((out[i] >> 24) & 0xff);
+ t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
+ t ^= (in[i] >> 8) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 8; t = ((out[i] >> 8) & 0xff);
+ t ^= (in[i] >> 16) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 16; t = ((out[i] >> 16) & 0xff);
+ t ^= (in[i] >> 24) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 24; t = ((out[i] >> 24) & 0xff);
}
}
}
}
-static void mdtransform_norecalc (u32 state[4], u32 block[4], __local u32 s_lotus_magic_table[256])
+static void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 s_lotus_magic_table[256])
{
- u32 x[12];
+ u32x x[12];
x[ 0] = state[0];
x[ 1] = state[1];
state[3] = x[3];
}
-static void mdtransform (u32 state[4], u32 checksum[4], u32 block[4], __local u32 s_lotus_magic_table[256])
+static void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 s_lotus_magic_table[256])
{
mdtransform_norecalc (state, block, s_lotus_magic_table);
lotus_transform_password (block, checksum, s_lotus_magic_table);
}
-static void domino_big_md (const u32 saved_key[16], const u32 size, u32 state[4], __local u32 s_lotus_magic_table[256])
+static void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __local u32 s_lotus_magic_table[256])
{
- u32 checksum[4];
+ u32x checksum[4];
checksum[0] = 0;
checksum[1] = 0;
checksum[2] = 0;
checksum[3] = 0;
- u32 block[4];
+ u32x block[4];
block[0] = saved_key[0];
block[1] = saved_key[1];
mdtransform_norecalc (state, checksum, s_lotus_magic_table);
}
-static void m08600m (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m08600m (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w_tmp[16];
+ u32x w_tmp[16];
w_tmp[ 0] = w0;
w_tmp[ 1] = w[ 1];
w_tmp[14] = w[14];
w_tmp[15] = w[15];
- u32 state[4];
+ u32x state[4];
state[0] = 0;
state[1] = 0;
domino_big_md (w_tmp, pw_len, state, s_lotus_magic_table);
- const u32 r0 = state[0];
- const u32 r1 = state[1];
- const u32 r2 = state[2];
- const u32 r3 = state[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (state[0], state[1], state[2], state[3]);
}
}
-static void m08600s (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m08600s (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w_tmp[16];
+ u32x w_tmp[16];
w_tmp[ 0] = w0;
w_tmp[ 1] = w[ 1];
w_tmp[14] = w[14];
w_tmp[15] = w[15];
- u32 state[4];
+ u32x state[4];
state[0] = 0;
state[1] = 0;
domino_big_md (w_tmp, pw_len, state, s_lotus_magic_table);
- const u32 r0 = state[0];
- const u32 r1 = state[1];
- const u32 r2 = state[2];
- const u32 r3 = state[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (state[0], state[1], state[2], state[3]);
}
}
-__kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08600_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08600_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08600_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08600_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08600_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08600_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08600_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08600_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
#define _LOTUS6_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 lotus_magic_table[256] =
{
#define BOX(S,i) (S)[(i)]
-#define uint_to_hex_upper8(i) l_bin2asc[(i)]
-
-static void lotus_mix (u32 *in, __local u32 s_lotus_magic_table[256])
+#if VECT_SIZE == 1
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
+
+#if VECT_SIZE == 1
+#define BOX1(S,i) (S)[(i)]
+#elif VECT_SIZE == 2
+#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#elif VECT_SIZE == 4
+#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#elif VECT_SIZE == 8
+#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#endif
+
+static void lotus_mix (u32x *in, __local u32 s_lotus_magic_table[256])
{
- u32 p = 0;
+ u32x p = 0;
for (int i = 0; i < 18; i++)
{
u32 s = 48;
- #pragma unroll
+ #pragma unroll 12
for (int j = 0; j < 12; j++)
{
- u32 tmp_in = in[j];
- u32 tmp_out = 0;
+ u32x tmp_in = in[j];
+ u32x tmp_out = 0;
- p = (p + s--) & 0xff; p = ((tmp_in >> 0) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 0;
- p = (p + s--) & 0xff; p = ((tmp_in >> 8) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 8;
- p = (p + s--) & 0xff; p = ((tmp_in >> 16) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 16;
- p = (p + s--) & 0xff; p = ((tmp_in >> 24) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 24;
+ p = (p + s--) & 0xff; p = ((tmp_in >> 0) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 0;
+ p = (p + s--) & 0xff; p = ((tmp_in >> 8) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 8;
+ p = (p + s--) & 0xff; p = ((tmp_in >> 16) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 16;
+ p = (p + s--) & 0xff; p = ((tmp_in >> 24) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 24;
in[j] = tmp_out;
}
}
}
-static void lotus_transform_password (u32 in[4], u32 out[4], __local u32 s_lotus_magic_table[256])
+static void lotus_transform_password (u32x in[4], u32x out[4], __local u32 s_lotus_magic_table[256])
{
- u32 t = out[3] >> 24;
+ u32x t = out[3] >> 24;
- u32 c;
+ u32x c;
- //#pragma unroll // kernel fails if used
+ #pragma unroll 4
for (int i = 0; i < 4; i++)
{
- t ^= (in[i] >> 0) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
- t ^= (in[i] >> 8) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 8; t = ((out[i] >> 8) & 0xff);
- t ^= (in[i] >> 16) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 16; t = ((out[i] >> 16) & 0xff);
- t ^= (in[i] >> 24) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 24; t = ((out[i] >> 24) & 0xff);
+ t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff);
+ t ^= (in[i] >> 8) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 8; t = ((out[i] >> 8) & 0xff);
+ t ^= (in[i] >> 16) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 16; t = ((out[i] >> 16) & 0xff);
+ t ^= (in[i] >> 24) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 24; t = ((out[i] >> 24) & 0xff);
}
}
}
}
-static void mdtransform_norecalc (u32 state[4], u32 block[4], __local u32 s_lotus_magic_table[256])
+static void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 s_lotus_magic_table[256])
{
- u32 x[12];
+ u32x x[12];
x[ 0] = state[0];
x[ 1] = state[1];
state[3] = x[3];
}
-static void mdtransform (u32 state[4], u32 checksum[4], u32 block[4], __local u32 s_lotus_magic_table[256])
+static void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 s_lotus_magic_table[256])
{
mdtransform_norecalc (state, block, s_lotus_magic_table);
lotus_transform_password (block, checksum, s_lotus_magic_table);
}
-static void domino_big_md (const u32 saved_key[16], const u32 size, u32 state[4], __local u32 s_lotus_magic_table[256])
+static void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __local u32 s_lotus_magic_table[256])
{
- u32 checksum[4];
+ u32x checksum[4];
checksum[0] = 0;
checksum[1] = 0;
checksum[2] = 0;
checksum[3] = 0;
- u32 block[4];
+ u32x block[4];
block[0] = 0;
block[1] = 0;
mdtransform_norecalc (state, checksum, s_lotus_magic_table);
}
-static void m08700m (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m08700m (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w_tmp[16];
+ u32x w_tmp[16];
w_tmp[ 0] = w0;
w_tmp[ 1] = w[ 1];
w_tmp[14] = w[14];
w_tmp[15] = w[15];
- u32 state[4];
+ u32x state[4];
state[0] = 0;
state[1] = 0;
domino_big_md (w_tmp, pw_len, state, s_lotus_magic_table);
- const u32 w0_t = uint_to_hex_upper8 ((state[0] >> 0) & 255) << 0
- | uint_to_hex_upper8 ((state[0] >> 8) & 255) << 16;
- const u32 w1_t = uint_to_hex_upper8 ((state[0] >> 16) & 255) << 0
- | uint_to_hex_upper8 ((state[0] >> 24) & 255) << 16;
- const u32 w2_t = uint_to_hex_upper8 ((state[1] >> 0) & 255) << 0
- | uint_to_hex_upper8 ((state[1] >> 8) & 255) << 16;
- const u32 w3_t = uint_to_hex_upper8 ((state[1] >> 16) & 255) << 0
- | uint_to_hex_upper8 ((state[1] >> 24) & 255) << 16;
- const u32 w4_t = uint_to_hex_upper8 ((state[2] >> 0) & 255) << 0
- | uint_to_hex_upper8 ((state[2] >> 8) & 255) << 16;
- const u32 w5_t = uint_to_hex_upper8 ((state[2] >> 16) & 255) << 0
- | uint_to_hex_upper8 ((state[2] >> 24) & 255) << 16;
- const u32 w6_t = uint_to_hex_upper8 ((state[3] >> 0) & 255) << 0
- | uint_to_hex_upper8 ((state[3] >> 8) & 255) << 16;
- //const u32 w7_t = uint_to_hex_upper8 ((state[3] >> 16) & 255) << 0
- // | uint_to_hex_upper8 ((state[3] >> 24) & 255) << 16;
-
- const u32 pade = 0x0e0e0e0e;
+ const u32x w0_t = uint_to_hex_upper8 ((state[0] >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((state[0] >> 8) & 255) << 16;
+ const u32x w1_t = uint_to_hex_upper8 ((state[0] >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((state[0] >> 24) & 255) << 16;
+ const u32x w2_t = uint_to_hex_upper8 ((state[1] >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((state[1] >> 8) & 255) << 16;
+ const u32x w3_t = uint_to_hex_upper8 ((state[1] >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((state[1] >> 24) & 255) << 16;
+ const u32x w4_t = uint_to_hex_upper8 ((state[2] >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((state[2] >> 8) & 255) << 16;
+ const u32x w5_t = uint_to_hex_upper8 ((state[2] >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((state[2] >> 24) & 255) << 16;
+ const u32x w6_t = uint_to_hex_upper8 ((state[3] >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((state[3] >> 8) & 255) << 16;
+ //const u32x w7_t = uint_to_hex_upper8 ((state[3] >> 16) & 255) << 0
+ // | uint_to_hex_upper8 ((state[3] >> 24) & 255) << 16;
+
+ const u32x pade = 0x0e0e0e0e;
w_tmp[ 0] = salt0;
w_tmp[ 1] = salt1 | w0_t << 16;
domino_big_md (w_tmp, 34, state, s_lotus_magic_table);
- u32 a = state[0] & 0xffffffff;
- u32 b = state[1] & 0xffffffff;
- u32 c = state[2] & 0x000000ff;
- u32 d = state[3] & 0x00000000;
+ u32x a = state[0] & 0xffffffff;
+ u32x b = state[1] & 0xffffffff;
+ u32x c = state[2] & 0x000000ff;
+ u32x d = state[3] & 0x00000000;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = c;
- const u32 r3 = d;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, b, c, d);
}
}
-static void m08700s (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m08700s (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w_tmp[16];
+ u32x w_tmp[16];
w_tmp[ 0] = w0;
w_tmp[ 1] = w[ 1];
w_tmp[14] = w[14];
w_tmp[15] = w[15];
- u32 state[4];
+ u32x state[4];
state[0] = 0;
state[1] = 0;
domino_big_md (w_tmp, pw_len, state, s_lotus_magic_table);
- const u32 w0_t = uint_to_hex_upper8 ((state[0] >> 0) & 255) << 0
- | uint_to_hex_upper8 ((state[0] >> 8) & 255) << 16;
- const u32 w1_t = uint_to_hex_upper8 ((state[0] >> 16) & 255) << 0
- | uint_to_hex_upper8 ((state[0] >> 24) & 255) << 16;
- const u32 w2_t = uint_to_hex_upper8 ((state[1] >> 0) & 255) << 0
- | uint_to_hex_upper8 ((state[1] >> 8) & 255) << 16;
- const u32 w3_t = uint_to_hex_upper8 ((state[1] >> 16) & 255) << 0
- | uint_to_hex_upper8 ((state[1] >> 24) & 255) << 16;
- const u32 w4_t = uint_to_hex_upper8 ((state[2] >> 0) & 255) << 0
- | uint_to_hex_upper8 ((state[2] >> 8) & 255) << 16;
- const u32 w5_t = uint_to_hex_upper8 ((state[2] >> 16) & 255) << 0
- | uint_to_hex_upper8 ((state[2] >> 24) & 255) << 16;
- const u32 w6_t = uint_to_hex_upper8 ((state[3] >> 0) & 255) << 0
- | uint_to_hex_upper8 ((state[3] >> 8) & 255) << 16;
- //const u32 w7_t = uint_to_hex_upper8 ((state[3] >> 16) & 255) << 0
- // | uint_to_hex_upper8 ((state[3] >> 24) & 255) << 16;
-
- const u32 pade = 0x0e0e0e0e;
+ const u32x w0_t = uint_to_hex_upper8 ((state[0] >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((state[0] >> 8) & 255) << 16;
+ const u32x w1_t = uint_to_hex_upper8 ((state[0] >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((state[0] >> 24) & 255) << 16;
+ const u32x w2_t = uint_to_hex_upper8 ((state[1] >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((state[1] >> 8) & 255) << 16;
+ const u32x w3_t = uint_to_hex_upper8 ((state[1] >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((state[1] >> 24) & 255) << 16;
+ const u32x w4_t = uint_to_hex_upper8 ((state[2] >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((state[2] >> 8) & 255) << 16;
+ const u32x w5_t = uint_to_hex_upper8 ((state[2] >> 16) & 255) << 0
+ | uint_to_hex_upper8 ((state[2] >> 24) & 255) << 16;
+ const u32x w6_t = uint_to_hex_upper8 ((state[3] >> 0) & 255) << 0
+ | uint_to_hex_upper8 ((state[3] >> 8) & 255) << 16;
+ //const u32x w7_t = uint_to_hex_upper8 ((state[3] >> 16) & 255) << 0
+ // | uint_to_hex_upper8 ((state[3] >> 24) & 255) << 16;
+
+ const u32x pade = 0x0e0e0e0e;
w_tmp[ 0] = salt0;
w_tmp[ 1] = salt1 | w0_t << 16;
domino_big_md (w_tmp, 34, state, s_lotus_magic_table);
- u32 a = state[0] & 0xffffffff;
- u32 b = state[1] & 0xffffffff;
- u32 c = state[2] & 0x000000ff;
- u32 d = state[3] & 0x00000000;
-
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = c;
- const u32 r3 = d;
+ u32x a = state[0] & 0xffffffff;
+ u32x b = state[1] & 0xffffffff;
+ u32x c = state[2] & 0x000000ff;
+ u32x d = state[3] & 0x00000000;
- #include COMPARE_S
+ COMPARE_S_SIMD (a, b, c, d);
}
}
-__kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08700m (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08700m (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08700m (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08700s (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m08700s (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m08700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m08700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _OLDOFFICE01_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
typedef struct
{
rc4_key->S[j] = tmp;
}
-static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4])
+static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4])
{
- u32 v = 0x03020100;
- u32 a = 0x04040404;
+ u32x v = 0x03020100;
+ u32x a = 0x04040404;
__local u32 *ptr = (__local u32 *) rc4_key->S;
}
}
-static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
+static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32x in[4], u32x out[4])
{
#pragma unroll
for (u32 k = 0; k < 4; k++)
return j;
}
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = pw_len * 8;
w3_t[3] = 0;
- u32 digest_t0[4];
- u32 digest_t1[2]; // need only first 5 byte
- u32 digest_t2[2];
- u32 digest_t3[2];
+ u32x digest_t0[4];
+ u32x digest_t1[2]; // need only first 5 byte
+ u32x digest_t2[2];
+ u32x digest_t3[2];
digest_t0[0] = MD5M_A;
digest_t0[1] = MD5M_B;
// prepare 16 * 21 buffer stuff
- u32 digest[4];
+ u32x digest[4];
digest[0] = MD5M_A;
digest[1] = MD5M_B;
// now the RC4 part
- u32 key[4];
+ u32x key[4];
key[0] = digest[0];
key[1] = digest[1];
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out);
rc4_next_16 (rc4_key, 16, j, digest, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (out[0], out[1], out[2], out[3]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = pw_len * 8;
w3_t[3] = 0;
- u32 digest_t0[4];
- u32 digest_t1[2]; // need only first 5 byte
- u32 digest_t2[2];
- u32 digest_t3[2];
+ u32x digest_t0[4];
+ u32x digest_t1[2]; // need only first 5 byte
+ u32x digest_t2[2];
+ u32x digest_t3[2];
digest_t0[0] = MD5M_A;
digest_t0[1] = MD5M_B;
// now the RC4 part
- u32 key[4];
+ u32x key[4];
key[0] = digest[0];
key[1] = digest[1];
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out);
rc4_next_16 (rc4_key, 16, j, digest, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (out[0], out[1], out[2], out[3]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
// first md5 to generate RC4 128 bit key
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
// first md5 to generate RC4 128 bit key
#define _OLDOFFICE01_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
typedef struct
{
rc4_key->S[j] = tmp;
}
-static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4])
+static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4])
{
- u32 v = 0x03020100;
- u32 a = 0x04040404;
+ u32x v = 0x03020100;
+ u32x a = 0x04040404;
__local u32 *ptr = (__local u32 *) rc4_key->S;
}
}
-static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
+static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32x in[4], u32x out[4])
{
#pragma unroll
for (u32 k = 0; k < 4; k++)
return j;
}
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
// first md5 to generate RC4 128 bit key
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1] & 0xff;
w0_t[2] = 0x8000;
w0_t[3] = 0;
w3_t[2] = 9 * 8;
w3_t[3] = 0;
- u32 digest[4];
+ u32x digest[4];
digest[0] = MD5M_A;
digest[1] = MD5M_B;
// now the RC4 part
- u32 key[4];
+ u32x key[4];
key[0] = digest[0];
key[1] = digest[1];
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out);
rc4_next_16 (rc4_key, 16, j, digest, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (out[0], out[1], out[2], out[3]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
// first md5 to generate RC4 128 bit key
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1] & 0xff;
w0_t[2] = 0x8000;
w0_t[3] = 0;
w3_t[2] = 9 * 8;
w3_t[3] = 0;
- u32 digest[4];
+ u32x digest[4];
digest[0] = MD5M_A;
digest[1] = MD5M_B;
// now the RC4 part
- u32 key[4];
+ u32x key[4];
key[0] = digest[0];
key[1] = digest[1];
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out);
rc4_next_16 (rc4_key, 16, j, digest, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (out[0], out[1], out[2], out[3]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _OLDOFFICE01_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
digest[3] += d;
}
-static void gen336 (u32 digest_pre[4], u32 salt_buf[4], u32 digest[4])
+static void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4])
{
- u32 digest_t0[2];
- u32 digest_t1[2];
- u32 digest_t2[2];
- u32 digest_t3[2];
+ u32x digest_t0[2];
+ u32x digest_t1[2];
+ u32x digest_t2[2];
+ u32x digest_t3[2];
digest_t0[0] = digest_pre[0];
digest_t0[1] = digest_pre[1] & 0xff;
digest_t3[0] = digest_pre[0] << 24;
digest_t3[1] = digest_pre[0] >> 8 | digest_pre[1] << 24;
- u32 salt_buf_t0[4];
- u32 salt_buf_t1[5];
- u32 salt_buf_t2[5];
- u32 salt_buf_t3[5];
+ u32x salt_buf_t0[4];
+ u32x salt_buf_t1[5];
+ u32x salt_buf_t2[5];
+ u32x salt_buf_t3[5];
salt_buf_t0[0] = salt_buf[0];
salt_buf_t0[1] = salt_buf[1];
salt_buf_t3[3] = salt_buf[2] >> 8 | salt_buf[3] << 24;
salt_buf_t3[4] = salt_buf[3] >> 8;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
// generate the 16 * 21 buffer
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = pw_len * 8;
w3_t[3] = 0;
- u32 digest_pre[4];
+ u32x digest_pre[4];
digest_pre[0] = MD5M_A;
digest_pre[1] = MD5M_B;
digest_pre[2] &= 0x00000000;
digest_pre[3] &= 0x00000000;
- u32 digest[4];
+ u32x digest[4];
digest[0] = MD5M_A;
digest[1] = MD5M_B;
gen336 (digest_pre, salt_buf, digest);
- u32 a = digest[0];
- u32 b = digest[1] & 0xff;
+ u32x a = digest[0];
+ u32x b = digest[1] & 0xff;
+ u32x c = 0;
+ u32x d = 0;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, b, c, d);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
- w0_t[0] = w0[0];
+ w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w3_t[2] = pw_len * 8;
w3_t[3] = 0;
- u32 digest_pre[4];
+ u32x digest_pre[4];
digest_pre[0] = MD5M_A;
digest_pre[1] = MD5M_B;
digest_pre[2] &= 0x00000000;
digest_pre[3] &= 0x00000000;
- u32 digest[4];
+ u32x digest[4];
digest[0] = MD5M_A;
digest[1] = MD5M_B;
gen336 (digest_pre, salt_buf, digest);
- u32 a = digest[0];
- u32 b = digest[1] & 0xff;
-
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x a = digest[0];
+ u32x b = digest[1] & 0xff;
+ u32x c = 0;
+ u32x d = 0;
- #include COMPARE_S
+ COMPARE_S_SIMD (a, b, c, d);
}
}
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
#define _OLDOFFICE34_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
typedef struct
{
rc4_key->S[j] = tmp;
}
-static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4])
+static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4])
{
- u32 v = 0x03020100;
- u32 a = 0x04040404;
+ u32x v = 0x03020100;
+ u32x a = 0x04040404;
__local u32 *ptr = (__local u32 *) rc4_key->S;
{
u32 idx = i * 16;
- u32 v;
+ u32x v;
v = data[0];
}
}
-static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
+static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32x in[4], u32x out[4])
{
#pragma unroll
for (u32 k = 0; k < 4; k++)
{
- u32 xor4 = 0;
+ u32x xor4 = 0;
u8 idx;
return j;
}
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
w0_t[2] = salt_buf[2];
w0_t[3] = salt_buf[3];
- w1_t[0] = w0[0];
+ w1_t[0] = w0lr;
w1_t[1] = w0[1];
w1_t[2] = w0[2];
w1_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = pw_salt_len * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
- u32 key[4];
+ u32x key[4];
key[0] = swap32 (digest[0]);
key[1] = swap32 (digest[1]);
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out);
rc4_next_16 (rc4_key, 16, j, digest, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (out[0], out[1], out[2], out[3]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
w0_t[2] = salt_buf[2];
w0_t[3] = salt_buf[3];
- w1_t[0] = w0[0];
+ w1_t[0] = w0lr;
w1_t[1] = w0[1];
w1_t[2] = w0[2];
w1_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = pw_salt_len * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
- u32 key[4];
+ u32x key[4];
key[0] = swap32 (digest[0]);
key[1] = swap32 (digest[1]);
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out);
rc4_next_16 (rc4_key, 16, j, digest, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (out[0], out[1], out[2], out[3]);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _OLDOFFICE34_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
typedef struct
{
rc4_key->S[j] = tmp;
}
-static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4])
+static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4])
{
- u32 v = 0x03020100;
- u32 a = 0x04040404;
+ u32x v = 0x03020100;
+ u32x a = 0x04040404;
__local u32 *ptr = (__local u32 *) rc4_key->S;
{
u32 idx = i * 16;
- u32 v;
+ u32x v;
v = data[0];
}
}
-static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4])
+static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32x in[4], u32x out[4])
{
#pragma unroll
for (u32 k = 0; k < 4; k++)
{
- u32 xor4 = 0;
+ u32x xor4 = 0;
u8 idx;
return j;
}
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 key[4];
+ u32x key[4];
- key[0] = w0[0];
+ key[0] = w0lr;
key[1] = w0[1] & 0xff;
key[2] = 0;
key[3] = 0;
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out);
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = swap32 (out[0]);
w0_t[1] = swap32 (out[1]);
w3_t[2] = 0;
w3_t[3] = 16 * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
rc4_next_16 (rc4_key, 16, j, digest, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (out[0], out[1], out[2], out[3]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 key[4];
+ u32x key[4];
- key[0] = w0[0];
+ key[0] = w0lr;
key[1] = w0[1] & 0xff;
key[2] = 0;
key[3] = 0;
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out);
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = swap32 (out[0]);
w0_t[1] = swap32 (out[1]);
w3_t[2] = 0;
w3_t[3] = 16 * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
rc4_next_16 (rc4_key, 16, j, digest, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (out[0], out[1], out[2], out[3]);
}
}
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t);
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len);
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
#define _OLDOFFICE34_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
+static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5])
{
- u32 A = digest[0];
- u32 B = digest[1];
- u32 C = digest[2];
- u32 D = digest[3];
- u32 E = digest[4];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x A = digest[0];
+ u32x B = digest[1];
+ u32x C = digest[2];
+ u32x D = digest[3];
+ u32x E = digest[4];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
#undef K
#define K SHA1C00
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
w0_t[2] = salt_buf[2];
w0_t[3] = salt_buf[3];
- w1_t[0] = w0[0];
+ w1_t[0] = w0lr;
w1_t[1] = w0[1];
w1_t[2] = w0[2];
w1_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = pw_salt_len * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
- u32 a = swap32 (digest[0]);
- u32 b = swap32 (digest[1]) & 0xff;
+ u32x a = swap32 (digest[0]);
+ u32x b = swap32 (digest[1]) & 0xff;
+ u32x c = 0;
+ u32x d = 0;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, b, c, d);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = salt_buf[0];
w0_t[1] = salt_buf[1];
w0_t[2] = salt_buf[2];
w0_t[3] = salt_buf[3];
- w1_t[0] = w0[0];
+ w1_t[0] = w0lr;
w1_t[1] = w0[1];
w1_t[2] = w0[2];
w1_t[3] = w0[3];
w3_t[2] = 0;
w3_t[3] = pw_salt_len * 8;
- u32 digest[5];
+ u32x digest[5];
digest[0] = SHA1M_A;
digest[1] = SHA1M_B;
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
- u32 a = swap32 (digest[0]);
- u32 b = swap32 (digest[1]) & 0xff;
-
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x a = swap32 (digest[0]);
+ u32x b = swap32 (digest[1]) & 0xff;
+ u32x c = 0;
+ u32x d = 0;
- #include COMPARE_S
+ COMPARE_S_SIMD (a, b, c, d);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-static void m09900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m09900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
- u32 t0[4];
- u32 t1[4];
- u32 t2[4];
- u32 t3[4];
+ u32x t0[4];
+ u32x t1[4];
+ u32x t2[4];
+ u32x t3[4];
t0[0] = 0;
t0[1] = 0;
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
-static void m09900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m09900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
- u32 t0[4];
- u32 t1[4];
- u32 t2[4];
- u32 t3[4];
+ u32x t0[4];
+ u32x t1[4];
+ u32x t2[4];
+ u32x t3[4];
t0[0] = 0;
t0[1] = 0;
MD5_STEP (MD5_I , b, c, d, a, t3[1], MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, t1[0], MD5C3c, MD5S30);
- if (allx ((a + r_a) != search[0])) continue;
+ if (MATCHES_NONE_VS ((a + r_a), search[0])) continue;
MD5_STEP (MD5_I , d, a, b, c, t2[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, t0[2], MD5C3e, MD5S32);
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
-__kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m09900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m09900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m09900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m09900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m09900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m09900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m09900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m09900s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m09900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m09900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m09900s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m09900_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m09900_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
#define _SIPHASH_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
#define SIPROUND(v0,v1,v2,v3) \
(v0) += (v1); \
(v1) = rotl64 ((v1), 13); \
(v1) ^= (v0); \
- (v0) = as_ulong (as_uint2 ((v0)).s10); \
+ (v0) = rotl64 ((v0), 32); \
(v2) += (v3); \
(v3) = rotl64 ((v3), 16); \
(v3) ^= (v2); \
(v2) += (v1); \
(v1) = rotl64 ((v1), 17); \
(v1) ^= (v2); \
- (v2) = as_ulong (as_uint2 ((v2)).s10);
+ (v2) = rotl64 ((v2), 32)
-static void m10100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m10100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u64 v2p = SIPHASHM_2;
u64 v3p = SIPHASHM_3;
- v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
- v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
- v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
- v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
+ v0p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
+ v1p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
+ v2p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
+ v3p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
u64 *w_ptr = (u64 *) w;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u64 v0 = v0p;
- u64 v1 = v1p;
- u64 v2 = v2p;
- u64 v3 = v3p;
+ u64x v0 = v0p;
+ u64x v1 = v1p;
+ u64x v2 = v2p;
+ u64x v3 = v3p;
- u64 m = hl32_to_64 (w[1], w0);
+ u64x m = hl32_to_64 (w[1], w0);
v3 ^= m;
SIPROUND (v0, v1, v2, v3);
SIPROUND (v0, v1, v2, v3);
- const u64 v = v0 ^ v1 ^ v2 ^ v3;
+ const u64x v = v0 ^ v1 ^ v2 ^ v3;
- const u32 a = l32_from_64 (v);
- const u32 b = h32_from_64 (v);
+ const u32x a = l32_from_64 (v);
+ const u32x b = h32_from_64 (v);
+ const u32x c = 0;
+ const u32x d = 0;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, b, c, d);
}
}
-static void m10100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m10100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u64 v2p = SIPHASHM_2;
u64 v3p = SIPHASHM_3;
- v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
- v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
- v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
- v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
+ v0p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
+ v1p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
+ v2p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]);
+ v3p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]);
u64 *w_ptr = (u64 *) w;
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u64 v0 = v0p;
- u64 v1 = v1p;
- u64 v2 = v2p;
- u64 v3 = v3p;
+ u64x v0 = v0p;
+ u64x v1 = v1p;
+ u64x v2 = v2p;
+ u64x v3 = v3p;
- u64 m = hl32_to_64 (w[1], w0);
+ u64x m = hl32_to_64 (w[1], w0);
v3 ^= m;
SIPROUND (v0, v1, v2, v3);
SIPROUND (v0, v1, v2, v3);
- const u64 v = v0 ^ v1 ^ v2 ^ v3;
-
- const u32 a = l32_from_64 (v);
- const u32 b = h32_from_64 (v);
+ const u64x v = v0 ^ v1 ^ v2 ^ v3;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
+ const u32x a = l32_from_64 (v);
+ const u32x b = h32_from_64 (v);
+ const u32x c = 0;
+ const u32x d = 0;
- #include COMPARE_S
+ COMPARE_S_SIMD (a, b, c, d);
}
}
-__kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
w3[2] = 0;
w3[3] = 0;
- switch_buffer_by_offset (w0, w1, w2, w3, pw_len);
+ switch_buffer_by_offset_le (w0, w1, w2, w3, pw_len);
w0[0] |= word_buf0[0];
w0[1] |= word_buf0[1];
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[2];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[2];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 padding[8] =
{
rc4_key->S[j] = tmp;
}
-static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4])
+static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4])
{
- u32 v = 0x03020100;
- u32 a = 0x04040404;
+ u32x v = 0x03020100;
+ u32x a = 0x04040404;
__local u32 *ptr = (__local u32 *) rc4_key->S;
ptr[i] = v; v += a;
}
- const u32 d0 = data[0] >> 0;
- const u32 d1 = data[0] >> 8;
- const u32 d2 = data[0] >> 16;
- const u32 d3 = data[0] >> 24;
- const u32 d4 = data[1] >> 0;
+ const u32x d0 = data[0] >> 0;
+ const u32x d1 = data[0] >> 8;
+ const u32x d2 = data[0] >> 16;
+ const u32x d3 = data[0] >> 24;
+ const u32x d4 = data[1] >> 0;
u32 j = 0;
j += rc4_key->S[255] + d0; swap (rc4_key, 255, j);
}
-static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 in[4], u32 out[4])
+static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32x in[4], u32x out[4])
{
for (u32 k = 0; k < 4; k++)
{
- u32 xor4 = 0;
+ u32x xor4 = 0;
u8 idx;
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
// now the RC4 part
- u32 key[4];
+ u32x key[4];
- key[0] = w0[0];
+ key[0] = w0lr;
key[1] = w0[1];
key[2] = 0;
key[3] = 0;
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
rc4_next_16 (rc4_key, 0, 0, padding, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_M
+ COMPARE_M_SIMD (out[0], out[1], out[2], out[3]);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
// now the RC4 part
- u32 key[4];
+ u32x key[4];
- key[0] = w0[0];
+ key[0] = w0lr;
key[1] = w0[1];
key[2] = 0;
key[3] = 0;
rc4_init_16 (rc4_key, key);
- u32 out[4];
+ u32x out[4];
rc4_next_16 (rc4_key, 0, 0, padding, out);
- const u32 r0 = out[0];
- const u32 r1 = out[1];
- const u32 r2 = out[2];
- const u32 r3 = out[3];
-
- #include COMPARE_S
+ COMPARE_S_SIMD (out[0], out[1], out[2], out[3]);
}
}
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 padding[8] =
{
0x7a695364
};
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
- u32 a = digest[0];
- u32 b = digest[1];
- u32 c = digest[2];
- u32 d = digest[3];
-
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = w3[2];
- u32 wf_t = w3[3];
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = w3[3];
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
// max length supported by pdf11 is 32
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
// add o_buf
- w0_t[0] |= w0[0];
+ w0_t[0] |= w0lr;
w0_t[1] |= w0[1];
w0_t[2] |= w0[2];
w0_t[3] |= w0[3];
w3_t[2] = o_buf[6];
w3_t[3] = o_buf[7];
- u32 digest[4];
+ u32x digest[4];
digest[0] = MD5M_A;
digest[1] = MD5M_B;
md5_transform (w0_t, w1_t, w2_t, w3_t, digest);
- u32 a = digest[0];
- u32 b = digest[1] & 0xff;
+ u32x a = digest[0];
+ u32x b = digest[1] & 0xff;
+ u32x c = 0;
+ u32x d = 0;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, b, c, d);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
// max length supported by pdf11 is 32
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
// add o_buf
- w0_t[0] |= w0[0];
+ w0_t[0] |= w0lr;
w0_t[1] |= w0[1];
w0_t[2] |= w0[2];
w0_t[3] |= w0[3];
w3_t[2] = o_buf[6];
w3_t[3] = o_buf[7];
- u32 digest[4];
+ u32x digest[4];
digest[0] = MD5M_A;
digest[1] = MD5M_B;
md5_transform (w0_t, w1_t, w2_t, w3_t, digest);
- u32 a = digest[0];
- u32 b = digest[1] & 0xff;
-
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x a = digest[0];
+ u32x b = digest[1] & 0xff;
+ u32x c = 0;
+ u32x d = 0;
- #include COMPARE_S
+ COMPARE_S_SIMD (a, b, c, d);
}
}
w3_t[2] = 0;
w3_t[3] = 0;
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
// add password
// truncate at 32 is wanted, not a bug!
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA384_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u64 k_sha384[80] =
{
SHA384C4c, SHA384C4d, SHA384C4e, SHA384C4f,
};
-static void sha384_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8])
+static void sha384_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8])
{
- u64 w0_t = hl32_to_64 (w0[0], w0[1]);
- u64 w1_t = hl32_to_64 (w0[2], w0[3]);
- u64 w2_t = hl32_to_64 (w1[0], w1[1]);
- u64 w3_t = hl32_to_64 (w1[2], w1[3]);
- u64 w4_t = hl32_to_64 (w2[0], w2[1]);
- u64 w5_t = hl32_to_64 (w2[2], w2[3]);
- u64 w6_t = hl32_to_64 (w3[0], w3[1]);
- u64 w7_t = 0;
- u64 w8_t = 0;
- u64 w9_t = 0;
- u64 wa_t = 0;
- u64 wb_t = 0;
- u64 wc_t = 0;
- u64 wd_t = 0;
- u64 we_t = 0;
- u64 wf_t = hl32_to_64 (w3[2], w3[3]);
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ u64x w0_t = hl32_to_64 (w0[0], w0[1]);
+ u64x w1_t = hl32_to_64 (w0[2], w0[3]);
+ u64x w2_t = hl32_to_64 (w1[0], w1[1]);
+ u64x w3_t = hl32_to_64 (w1[2], w1[3]);
+ u64x w4_t = hl32_to_64 (w2[0], w2[1]);
+ u64x w5_t = hl32_to_64 (w2[2], w2[3]);
+ u64x w6_t = hl32_to_64 (w3[0], w3[1]);
+ u64x w7_t = 0;
+ u64x w8_t = 0;
+ u64x w9_t = 0;
+ u64x wa_t = 0;
+ u64x wb_t = 0;
+ u64x wc_t = 0;
+ u64x wd_t = 0;
+ u64x we_t = 0;
+ u64x wf_t = hl32_to_64 (w3[2], w3[3]);
+
+ u64x a = digest[0];
+ u64x b = digest[1];
+ u64x c = digest[2];
+ u64x d = digest[3];
+ u64x e = digest[4];
+ u64x f = digest[5];
+ u64x g = digest[6];
+ u64x h = digest[7];
#define ROUND_EXPAND() \
{ \
digest[7] = 0;
}
-static void m10800m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m10800m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = w0;
w0_t[1] = w[ 1];
w3_t[2] = w[14];
w3_t[3] = w[15];
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA384M_A;
digest[1] = SHA384M_B;
sha384_transform (w0_t, w1_t, w2_t, w3_t, digest);
- const u32 r0 = l32_from_64 (digest[3]);
- const u32 r1 = h32_from_64 (digest[3]);
- const u32 r2 = l32_from_64 (digest[2]);
- const u32 r3 = h32_from_64 (digest[2]);
+ const u32x r0 = l32_from_64 (digest[3]);
+ const u32x r1 = h32_from_64 (digest[3]);
+ const u32x r2 = l32_from_64 (digest[2]);
+ const u32x r3 = h32_from_64 (digest[2]);
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
-static void m10800s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m10800s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = w0;
w0_t[1] = w[ 1];
w3_t[2] = w[14];
w3_t[3] = w[15];
- u64 digest[8];
+ u64x digest[8];
digest[0] = SHA384M_A;
digest[1] = SHA384M_B;
sha384_transform (w0_t, w1_t, w2_t, w3_t, digest);
- const u32 r0 = l32_from_64 (digest[3]);
- const u32 r1 = h32_from_64 (digest[3]);
- const u32 r2 = l32_from_64 (digest[2]);
- const u32 r3 = h32_from_64 (digest[2]);
+ const u32x r0 = l32_from_64 (digest[3]);
+ const u32x r1 = h32_from_64 (digest[3]);
+ const u32x r2 = l32_from_64 (digest[2]);
+ const u32x r3 = h32_from_64 (digest[2]);
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
-__kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10800m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10800_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10800_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10800m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10800_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10800_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10800m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10800s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10800_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10800_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m10800s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m10800_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m10800_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* prepend salt
// first step fixed 56 bytes of salt
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1];
// after 56 byte salt, we have beginning of the password
- w3_t[2] = w0[0];
+ w3_t[2] = w0lr;
w3_t[3] = w0[1];
/**
// first transform
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
// 2nd transform
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* prepend salt
// first step fixed 56 bytes of salt
- u32 w0_t[4];
- u32 w1_t[4];
- u32 w2_t[4];
- u32 w3_t[4];
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1];
// after 56 byte salt, we have beginning of the password
- w3_t[2] = w0[0];
+ w3_t[2] = w0lr;
w3_t[3] = w0[1];
/**
// first transform
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
// 2nd transform
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
* append the salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
const u32 pw_salt_len = out_len + salt_len;
* append the salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
const u32 pw_salt_len = out_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0_t[4];
* append the salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0_t[4];
* append the salt
*/
- switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len);
+ switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m11100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
const u32 salt_len = salt_bufs[salt_pos].salt_len - 4;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0] | salt_buf0[0];
+ w0_t[0] = w0lr | salt_buf0[0];
w0_t[1] = w0[1] | salt_buf0[1];
w0_t[2] = w0[2] | salt_buf0[2];
w0_t[3] = w0[3] | salt_buf0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0] | salt_buf1[0];
w1_t[1] = w1[1] | salt_buf1[1];
w1_t[2] = w1[2] | salt_buf1[2];
w1_t[3] = w1[3] | salt_buf1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0] | salt_buf2[0];
w2_t[1] = w2[1] | salt_buf2[1];
w2_t[2] = w2[2] | salt_buf2[2];
w2_t[3] = w2[3] | salt_buf2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0] | salt_buf3[0];
w3_t[1] = w3[1] | salt_buf3[1];
* md5 ($pass.$salt)
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, d, c, b);
}
}
const u32 salt_len = salt_bufs[salt_pos].salt_len - 4;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+ switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
const u32 pw_salt_len = pw_len + salt_len;
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
- u32 w0_t[4];
+ u32x w0_t[4];
- w0_t[0] = w0[0] | salt_buf0[0];
+ w0_t[0] = w0lr | salt_buf0[0];
w0_t[1] = w0[1] | salt_buf0[1];
w0_t[2] = w0[2] | salt_buf0[2];
w0_t[3] = w0[3] | salt_buf0[3];
- u32 w1_t[4];
+ u32x w1_t[4];
w1_t[0] = w1[0] | salt_buf1[0];
w1_t[1] = w1[1] | salt_buf1[1];
w1_t[2] = w1[2] | salt_buf1[2];
w1_t[3] = w1[3] | salt_buf1[3];
- u32 w2_t[4];
+ u32x w2_t[4];
w2_t[0] = w2[0] | salt_buf2[0];
w2_t[1] = w2[1] | salt_buf2[1];
w2_t[2] = w2[2] | salt_buf2[2];
w2_t[3] = w2[3] | salt_buf2[3];
- u32 w3_t[4];
+ u32x w3_t[4];
w3_t[0] = w3[0] | salt_buf3[0];
w3_t[1] = w3[1] | salt_buf3[1];
* md5 ($pass.$salt)
*/
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
static void m11200m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
u32 salt_buf[5];
- salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]);
- salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]);
- salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]);
- salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]);
- salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]);
+ salt_buf[0] = swap32_S (salt_bufs[salt_pos].salt_buf[0]);
+ salt_buf[1] = swap32_S (salt_bufs[salt_pos].salt_buf[1]);
+ salt_buf[2] = swap32_S (salt_bufs[salt_pos].salt_buf[2]);
+ salt_buf[3] = swap32_S (salt_bufs[salt_pos].salt_buf[3]);
+ salt_buf[4] = swap32_S (salt_bufs[salt_pos].salt_buf[4]);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1 ($pass)
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
- u32 plain_sha1_a = a + SHA1M_A;
- u32 plain_sha1_b = b + SHA1M_B;
- u32 plain_sha1_c = c + SHA1M_C;
- u32 plain_sha1_d = d + SHA1M_D;
- u32 plain_sha1_e = e + SHA1M_E;
+ u32x plain_sha1_a = a + SHA1M_A;
+ u32x plain_sha1_b = b + SHA1M_B;
+ u32x plain_sha1_c = c + SHA1M_C;
+ u32x plain_sha1_d = d + SHA1M_D;
+ u32x plain_sha1_e = e + SHA1M_E;
/**
* sha1 (sha1 ($pass))
d ^= plain_sha1_d;
e ^= plain_sha1_e;
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, e, c, b);
}
}
u32 salt_buf[5];
- salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]);
- salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]);
- salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]);
- salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]);
- salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]);
+ salt_buf[0] = swap32_S (salt_bufs[salt_pos].salt_buf[0]);
+ salt_buf[1] = swap32_S (salt_bufs[salt_pos].salt_buf[1]);
+ salt_buf[2] = swap32_S (salt_bufs[salt_pos].salt_buf[2]);
+ salt_buf[3] = swap32_S (salt_bufs[salt_pos].salt_buf[3]);
+ salt_buf[4] = swap32_S (salt_bufs[salt_pos].salt_buf[4]);
/**
* loop
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1 ($pass)
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
#undef K
#define K SHA1C00
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
- u32 plain_sha1_a = a + SHA1M_A;
- u32 plain_sha1_b = b + SHA1M_B;
- u32 plain_sha1_c = c + SHA1M_C;
- u32 plain_sha1_d = d + SHA1M_D;
- u32 plain_sha1_e = e + SHA1M_E;
+ u32x plain_sha1_a = a + SHA1M_A;
+ u32x plain_sha1_b = b + SHA1M_B;
+ u32x plain_sha1_c = c + SHA1M_C;
+ u32x plain_sha1_d = d + SHA1M_D;
+ u32x plain_sha1_e = e + SHA1M_E;
/**
* sha1 (sha1 ($pass))
d ^= plain_sha1_d;
e ^= plain_sha1_e;
- const u32 r0 = d;
- const u32 r1 = e;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, e, c, b);
}
}
u32 salt_len = salt_bufs[salt_pos].salt_len;
- switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+ switch_buffer_by_offset_le (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_lower8(i) l_bin2asc[(i)]
-
-static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 append3[4], const u32 append_len)
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
+
+static u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len)
{
const u32 mod = block_len & 3;
const u32 div = block_len / 4;
#if defined IS_AMD || defined IS_GENERIC
const int offset_minus_4 = 4 - mod;
- u32 append0_t[4];
+ u32x append0_t[4];
append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4);
append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4);
append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4);
append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4);
- u32 append1_t[4];
+ u32x append1_t[4];
append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4);
append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4);
append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4);
append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4);
- u32 append2_t[4];
+ u32x append2_t[4];
append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4);
append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4);
append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4);
append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4);
- u32 append3_t[4];
+ u32x append3_t[4];
append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4);
append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4);
append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4);
append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4);
- u32 append4_t[4];
+ u32x append4_t[4];
append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4);
append4_t[1] = 0;
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
- u32 append0_t[4];
+ u32x append0_t[4];
append0_t[0] = __byte_perm ( 0, append0[0], selector);
append0_t[1] = __byte_perm (append0[0], append0[1], selector);
append0_t[2] = __byte_perm (append0[1], append0[2], selector);
append0_t[3] = __byte_perm (append0[2], append0[3], selector);
- u32 append1_t[4];
+ u32x append1_t[4];
append1_t[0] = __byte_perm (append0[3], append1[0], selector);
append1_t[1] = __byte_perm (append1[0], append1[1], selector);
append1_t[2] = __byte_perm (append1[1], append1[2], selector);
append1_t[3] = __byte_perm (append1[2], append1[3], selector);
- u32 append2_t[4];
+ u32x append2_t[4];
append2_t[0] = __byte_perm (append1[3], append2[0], selector);
append2_t[1] = __byte_perm (append2[0], append2[1], selector);
append2_t[2] = __byte_perm (append2[1], append2[2], selector);
append2_t[3] = __byte_perm (append2[2], append2[3], selector);
- u32 append3_t[4];
+ u32x append3_t[4];
append3_t[0] = __byte_perm (append2[3], append3[0], selector);
append3_t[1] = __byte_perm (append3[0], append3[1], selector);
append3_t[2] = __byte_perm (append3[1], append3[2], selector);
append3_t[3] = __byte_perm (append3[2], append3[3], selector);
- u32 append4_t[4];
+ u32x append4_t[4];
append4_t[0] = __byte_perm (append3[3], 0, selector);
append4_t[1] = 0;
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/*
* HA1 = md5 ($salt . $pass)
// append the pass to the salt
- u32 block0[16];
+ u32x block0[16];
block0[ 0] = salt_buf0[ 0];
block0[ 1] = salt_buf0[ 1];
block0[14] = salt_buf0[14];
block0[15] = salt_buf0[15];
- u32 block1[16];
+ u32x block1[16];
block1[ 0] = salt_buf1[ 0];
block1[ 1] = salt_buf1[ 1];
block1[14] = salt_buf1[14];
block1[15] = salt_buf1[15];
- memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
- u32 w0_t[4];
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len);
w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3];
- u32 w1_t[4];
-
w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7];
- u32 w2_t[4];
-
w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9];
w2_t[2] = block0[10];
w2_t[3] = block0[11];
- u32 w3_t[4];
-
w3_t[0] = block0[12];
w3_t[1] = block0[13];
w3_t[2] = pw_salt_len * 8;
// md5
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
// 2nd transform
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/*
* HA1 = md5 ($salt . $pass)
// append the pass to the salt
- u32 block0[16];
+ u32x block0[16];
block0[ 0] = salt_buf0[ 0];
block0[ 1] = salt_buf0[ 1];
block0[14] = salt_buf0[14];
block0[15] = salt_buf0[15];
- u32 block1[16];
+ u32x block1[16];
block1[ 0] = salt_buf1[ 0];
block1[ 1] = salt_buf1[ 1];
block1[14] = salt_buf1[14];
block1[15] = salt_buf1[15];
- memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
- u32 w0_t[4];
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len);
w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3];
- u32 w1_t[4];
-
w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7];
- u32 w2_t[4];
-
w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9];
w2_t[2] = block0[10];
w2_t[3] = block0[11];
- u32 w3_t[4];
-
w3_t[0] = block0[12];
w3_t[1] = block0[13];
w3_t[2] = pw_salt_len * 8;
// md5
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
// 2nd transform
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/*
* HA1 = md5 ($salt . $pass)
// append the pass to the salt
- u32 block0[16];
+ u32x block0[16];
block0[ 0] = salt_buf0[ 0];
block0[ 1] = salt_buf0[ 1];
block0[14] = salt_buf0[14];
block0[15] = salt_buf0[15];
- u32 block1[16];
+ u32x block1[16];
block1[ 0] = salt_buf1[ 0];
block1[ 1] = salt_buf1[ 1];
block1[14] = salt_buf1[14];
block1[15] = salt_buf1[15];
- memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
- u32 w0_t[4];
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len);
w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3];
- u32 w1_t[4];
-
w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7];
- u32 w2_t[4];
-
w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9];
w2_t[2] = block0[10];
w2_t[3] = block0[11];
- u32 w3_t[4];
-
w3_t[0] = block0[12];
w3_t[1] = block0[13];
w3_t[2] = block0[14];
// md5
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
w0_t[0] = block1[ 0];
w0_t[1] = block1[ 1];
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/*
* HA1 = md5 ($salt . $pass)
// append the pass to the salt
- u32 block0[16];
+ u32x block0[16];
block0[ 0] = salt_buf0[ 0];
block0[ 1] = salt_buf0[ 1];
block1[14] = salt_buf1[14];
block1[15] = salt_buf1[15];
- memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
- u32 w0_t[4];
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len);
w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3];
- u32 w1_t[4];
-
w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7];
- u32 w2_t[4];
-
w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9];
w2_t[2] = block0[10];
w2_t[3] = block0[11];
- u32 w3_t[4];
-
w3_t[0] = block0[12];
w3_t[1] = block0[13];
w3_t[2] = block0[14];
// md5
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
w0_t[0] = block1[ 0];
w0_t[1] = block1[ 1];
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/*
* HA1 = md5 ($salt . $pass)
// append the pass to the salt
- u32 block0[16];
+ u32x block0[16];
block0[ 0] = salt_buf0[ 0];
block0[ 1] = salt_buf0[ 1];
block0[14] = salt_buf0[14];
block0[15] = salt_buf0[15];
- u32 block1[16];
+ u32x block1[16];
block1[ 0] = salt_buf1[ 0];
block1[ 1] = salt_buf1[ 1];
block1[14] = salt_buf1[14];
block1[15] = salt_buf1[15];
- memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
- u32 w0_t[4];
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
+
+ memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len);
w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3];
- u32 w1_t[4];
-
w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7];
- u32 w2_t[4];
-
w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9];
w2_t[2] = block0[10];
w2_t[3] = block0[11];
- u32 w3_t[4];
-
w3_t[0] = block0[12];
w3_t[1] = block0[13];
w3_t[2] = pw_salt_len * 8;
// md5
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
// 2nd transform
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/*
* HA1 = md5 ($salt . $pass)
// append the pass to the salt
- u32 block0[16];
+ u32x block0[16];
block0[ 0] = salt_buf0[ 0];
block0[ 1] = salt_buf0[ 1];
block0[14] = salt_buf0[14];
block0[15] = salt_buf0[15];
- u32 block1[16];
+ u32x block1[16];
block1[ 0] = salt_buf1[ 0];
block1[ 1] = salt_buf1[ 1];
block1[14] = salt_buf1[14];
block1[15] = salt_buf1[15];
- memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
- u32 w0_t[4];
+ memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len);
w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3];
- u32 w1_t[4];
-
w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7];
- u32 w2_t[4];
-
w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9];
w2_t[2] = block0[10];
w2_t[3] = block0[11];
- u32 w3_t[4];
-
w3_t[0] = block0[12];
w3_t[1] = block0[13];
w3_t[2] = pw_salt_len * 8;
// md5
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
// 2nd transform
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/*
* HA1 = md5 ($salt . $pass)
// append the pass to the salt
- u32 block0[16];
+ u32x block0[16];
block0[ 0] = salt_buf0[ 0];
block0[ 1] = salt_buf0[ 1];
block0[14] = salt_buf0[14];
block0[15] = salt_buf0[15];
- u32 block1[16];
+ u32x block1[16];
block1[ 0] = salt_buf1[ 0];
block1[ 1] = salt_buf1[ 1];
block1[14] = salt_buf1[14];
block1[15] = salt_buf1[15];
- memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
- u32 w0_t[4];
+ memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len);
w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3];
- u32 w1_t[4];
-
w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7];
- u32 w2_t[4];
-
w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9];
w2_t[2] = block0[10];
w2_t[3] = block0[11];
- u32 w3_t[4];
-
w3_t[0] = block0[12];
w3_t[1] = block0[13];
w3_t[2] = block0[14];
// md5
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
w0_t[0] = block1[ 0];
w0_t[1] = block1[ 1];
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/*
* HA1 = md5 ($salt . $pass)
// append the pass to the salt
- u32 block0[16];
+ u32x block0[16];
block0[ 0] = salt_buf0[ 0];
block0[ 1] = salt_buf0[ 1];
block0[14] = salt_buf0[14];
block0[15] = salt_buf0[15];
- u32 block1[16];
+ u32x block1[16];
block1[ 0] = salt_buf1[ 0];
block1[ 1] = salt_buf1[ 1];
block1[14] = salt_buf1[14];
block1[15] = salt_buf1[15];
- memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
+ u32x w0_t[4];
+ u32x w1_t[4];
+ u32x w2_t[4];
+ u32x w3_t[4];
+
+ w0_t[0] = w0lr;
+ w0_t[1] = w0[1];
+ w0_t[2] = w0[2];
+ w0_t[3] = w0[3];
+
+ w1_t[0] = w1[0];
+ w1_t[1] = w1[1];
+ w1_t[2] = w1[2];
+ w1_t[3] = w1[3];
+
+ w2_t[0] = w2[0];
+ w2_t[1] = w2[1];
+ w2_t[2] = w2[2];
+ w2_t[3] = w2[3];
+
+ w3_t[0] = w3[0];
+ w3_t[1] = w3[1];
+ w3_t[2] = w3[2];
+ w3_t[3] = w3[3];
- u32 w0_t[4];
+ memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len);
w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3];
- u32 w1_t[4];
-
w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7];
- u32 w2_t[4];
-
w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9];
w2_t[2] = block0[10];
w2_t[3] = block0[11];
- u32 w3_t[4];
-
w3_t[0] = block0[12];
w3_t[1] = block0[13];
w3_t[2] = block0[14];
// md5
- u32 tmp2;
+ u32x tmp2;
- u32 a = MD5M_A;
- u32 b = MD5M_B;
- u32 c = MD5M_C;
- u32 d = MD5M_D;
+ u32x a = MD5M_A;
+ u32x b = MD5M_B;
+ u32x c = MD5M_C;
+ u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
c += MD5M_C;
d += MD5M_D;
- u32 r_a = a;
- u32 r_b = b;
- u32 r_c = c;
- u32 r_d = d;
+ u32x r_a = a;
+ u32x r_b = b;
+ u32x r_c = c;
+ u32x r_d = d;
w0_t[0] = block1[ 0];
w0_t[1] = block1[ 1];
c += r_c;
d += r_d;
- const u32 r0 = a;
- const u32 r1 = d;
- const u32 r2 = c;
- const u32 r3 = b;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (a, d, c, b);
}
}
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w_t[16];
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w_t[16];
#define _CRC32_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
__constant u32 crc32tab[0x100] =
{
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
-static u32 round_crc32 (u32 a, const u32 v)
+static u32x round_crc32 (u32x a, const u32x v)
{
- const u32 k = (a ^ v) & 0xff;
+ const u32x k = (a ^ v) & 0xff;
- const u32 s = a >> 8;
+ const u32x s = a >> 8;
- a = crc32tab[k];
+ #if VECT_SIZE == 1
+ a = (u32x) crc32tab[k];
+ #elif VECT_SIZE == 2
+ a = (u32x) (crc32tab[k.s0], crc32tab[k.s1]);
+ #elif VECT_SIZE == 4
+ a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]);
+ #elif VECT_SIZE == 8
+ a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]);
+ #endif
a ^= s;
return a;
}
-static u32 crc32 (const u32 w[16], const u32 pw_len, const u32 iv)
+static u32x crc32 (const u32x w[16], const u32 pw_len, const u32 iv)
{
- u32 a = iv ^ ~0;
+ u32x a = iv ^ ~0;
if (pw_len >= 1) a = round_crc32 (a, w[0] >> 0);
if (pw_len >= 2) a = round_crc32 (a, w[0] >> 8);
return ~a;
}
-static void m11500m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m11500m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w_t[16];
+ u32x w_t[16];
w_t[ 0] = w0;
w_t[ 1] = w[ 1];
w_t[14] = w[14];
w_t[15] = w[15];
- u32 a = crc32 (w_t, pw_len, iv);
- u32 b = 0;
+ u32x a = crc32 (w_t, pw_len, iv);
+ u32x b = 0;
+ u32x c = 0;
+ u32x d = 0;
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (a, b, c, d);
}
}
-static void m11500s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
+static void m11500s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = words_buf_r[il_pos];
+ const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
- const u32 w0 = w0l | w0r;
+ const u32x w0 = w0l | w0r;
- u32 w_t[16];
+ u32x w_t[16];
w_t[ 0] = w0;
w_t[ 1] = w[ 1];
w_t[14] = w[14];
w_t[15] = w[15];
- u32 a = crc32 (w_t, pw_len, iv);
- u32 b = 0;
-
- const u32 r0 = a;
- const u32 r1 = b;
- const u32 r2 = 0;
- const u32 r3 = 0;
+ u32x a = crc32 (w_t, pw_len, iv);
+ u32x b = 0;
+ u32x c = 0;
+ u32x d = 0;
- #include COMPARE_S
+ COMPARE_S_SIMD (a, b, c, d);
}
}
-__kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m11500m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m11500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m11500m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m11500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m11500m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m11500s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m11500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
m11500s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
-__kernel void m11500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+__kernel void m11500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
#define _GOST2012_256_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
#define INITVAL 0x0101010101010101
-#define SBOG_LPSti64 \
- s_sbob_sl64[0][(t[0] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[1][(t[1] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[2][(t[2] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[3][(t[3] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[4][(t[4] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[5][(t[5] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[6][(t[6] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[7][(t[7] >> (i * 8)) & 0xff]
+#if VECT_SIZE == 1
+#define BOX(S,n,i) (S)[(n)][(i)]
+#elif VECT_SIZE == 2
+#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#elif VECT_SIZE == 4
+#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#elif VECT_SIZE == 8
+#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#endif
+
+#define SBOG_LPSti64 \
+ BOX (s_sbob_sl64, 0, ((t[0] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 1, ((t[1] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 2, ((t[2] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 3, ((t[3] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 4, ((t[4] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 5, ((t[5] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 6, ((t[6] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 7, ((t[7] >> (i * 8)) & 0xff))
// constants
},
};
-static void streebog_g (u64 h[8], const u64 m[8], __local u64 s_sbob_sl64[8][256])
+static void streebog_g (u64x h[8], const u64x m[8], __local u64 s_sbob_sl64[8][256])
{
- u64 k[8];
- u64 s[8];
- u64 t[8];
+ u64x k[8];
+ u64x s[8];
+ u64x t[8];
#pragma unroll
for (int i = 0; i < 8; i++)
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* reverse message block
*/
- u64 m[8];
+ u64x m[8];
m[0] = hl32_to_64 (w[15], w[14]);
m[1] = hl32_to_64 (w[13], w[12]);
m[4] = hl32_to_64 (w[ 7], w[ 6]);
m[5] = hl32_to_64 (w[ 5], w[ 4]);
m[6] = hl32_to_64 (w[ 3], w[ 2]);
- m[7] = hl32_to_64 (w[ 1], w[ 0]);
+ m[7] = hl32_to_64 (w[ 1], w0lr );
m[0] = swap64 (m[0]);
m[1] = swap64 (m[1]);
// state buffer (hash)
- u64 h[8];
+ u64x h[8];
h[0] = INITVAL;
h[1] = INITVAL;
streebog_g (h, m, s_sbob_sl64);
- u64 z[8];
+ u64x z[8];
z[0] = 0;
z[1] = 0;
streebog_g (h, z, s_sbob_sl64);
streebog_g (h, m, s_sbob_sl64);
- const u32 r0 = l32_from_64 (h[0]);
- const u32 r1 = h32_from_64 (h[0]);
- const u32 r2 = l32_from_64 (h[1]);
- const u32 r3 = h32_from_64 (h[1]);
+ const u32x r0 = l32_from_64 (h[0]);
+ const u32x r1 = h32_from_64 (h[0]);
+ const u32x r2 = l32_from_64 (h[1]);
+ const u32x r3 = h32_from_64 (h[1]);
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* reverse message block
*/
- u64 m[8];
+ u64x m[8];
m[0] = hl32_to_64 (w[15], w[14]);
m[1] = hl32_to_64 (w[13], w[12]);
m[4] = hl32_to_64 (w[ 7], w[ 6]);
m[5] = hl32_to_64 (w[ 5], w[ 4]);
m[6] = hl32_to_64 (w[ 3], w[ 2]);
- m[7] = hl32_to_64 (w[ 1], w[ 0]);
+ m[7] = hl32_to_64 (w[ 1], w0lr );
m[0] = swap64 (m[0]);
m[1] = swap64 (m[1]);
// state buffer (hash)
- u64 h[8];
+ u64x h[8];
h[0] = INITVAL;
h[1] = INITVAL;
streebog_g (h, m, s_sbob_sl64);
- u64 z[8];
+ u64x z[8];
z[0] = 0;
z[1] = 0;
streebog_g (h, z, s_sbob_sl64);
streebog_g (h, m, s_sbob_sl64);
- const u32 r0 = l32_from_64 (h[0]);
- const u32 r1 = h32_from_64 (h[0]);
- const u32 r2 = l32_from_64 (h[1]);
- const u32 r3 = h32_from_64 (h[1]);
+ const u32x r0 = l32_from_64 (h[0]);
+ const u32x r1 = h32_from_64 (h[0]);
+ const u32x r2 = l32_from_64 (h[1]);
+ const u32x r3 = h32_from_64 (h[1]);
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w[16];
#define _GOST2012_512_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"
#define INITVAL 0
-#define SBOG_LPSti64 \
- s_sbob_sl64[0][(t[0] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[1][(t[1] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[2][(t[2] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[3][(t[3] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[4][(t[4] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[5][(t[5] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[6][(t[6] >> (i * 8)) & 0xff] ^ \
- s_sbob_sl64[7][(t[7] >> (i * 8)) & 0xff]
+#if VECT_SIZE == 1
+#define BOX(S,n,i) (S)[(n)][(i)]
+#elif VECT_SIZE == 2
+#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#elif VECT_SIZE == 4
+#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#elif VECT_SIZE == 8
+#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#endif
+
+#define SBOG_LPSti64 \
+ BOX (s_sbob_sl64, 0, ((t[0] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 1, ((t[1] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 2, ((t[2] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 3, ((t[3] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 4, ((t[4] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 5, ((t[5] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 6, ((t[6] >> (i * 8)) & 0xff)) ^ \
+ BOX (s_sbob_sl64, 7, ((t[7] >> (i * 8)) & 0xff))
// constants
},
};
-static void streebog_g (u64 h[8], const u64 m[8], __local u64 s_sbob_sl64[8][256])
+static void streebog_g (u64x h[8], const u64x m[8], __local u64 s_sbob_sl64[8][256])
{
- u64 k[8];
- u64 s[8];
- u64 t[8];
+ u64x k[8];
+ u64x s[8];
+ u64x t[8];
#pragma unroll
for (int i = 0; i < 8; i++)
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* reverse message block
*/
- u64 m[8];
+ u64x m[8];
m[0] = hl32_to_64 (w[15], w[14]);
m[1] = hl32_to_64 (w[13], w[12]);
m[4] = hl32_to_64 (w[ 7], w[ 6]);
m[5] = hl32_to_64 (w[ 5], w[ 4]);
m[6] = hl32_to_64 (w[ 3], w[ 2]);
- m[7] = hl32_to_64 (w[ 1], w[ 0]);
+ m[7] = hl32_to_64 (w[ 1], w0lr );
m[0] = swap64 (m[0]);
m[1] = swap64 (m[1]);
// state buffer (hash)
- u64 h[8];
+ u64x h[8];
h[0] = INITVAL;
h[1] = INITVAL;
streebog_g (h, m, s_sbob_sl64);
- u64 z[8];
+ u64x z[8];
z[0] = 0;
z[1] = 0;
streebog_g (h, z, s_sbob_sl64);
streebog_g (h, m, s_sbob_sl64);
- const u32 r0 = l32_from_64 (h[0]);
- const u32 r1 = h32_from_64 (h[0]);
- const u32 r2 = l32_from_64 (h[1]);
- const u32 r3 = h32_from_64 (h[1]);
+ const u32x r0 = l32_from_64 (h[0]);
+ const u32x r1 = h32_from_64 (h[0]);
+ const u32x r2 = l32_from_64 (h[1]);
+ const u32x r3 = h32_from_64 (h[1]);
- #include COMPARE_M
+ COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
u32 w0l = w[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* reverse message block
*/
- u64 m[8];
+ u64x m[8];
m[0] = hl32_to_64 (w[15], w[14]);
m[1] = hl32_to_64 (w[13], w[12]);
m[4] = hl32_to_64 (w[ 7], w[ 6]);
m[5] = hl32_to_64 (w[ 5], w[ 4]);
m[6] = hl32_to_64 (w[ 3], w[ 2]);
- m[7] = hl32_to_64 (w[ 1], w[ 0]);
+ m[7] = hl32_to_64 (w[ 1], w0lr );
m[0] = swap64 (m[0]);
m[1] = swap64 (m[1]);
// state buffer (hash)
- u64 h[8];
+ u64x h[8];
h[0] = INITVAL;
h[1] = INITVAL;
streebog_g (h, m, s_sbob_sl64);
- u64 z[8];
+ u64x z[8];
z[0] = 0;
z[1] = 0;
streebog_g (h, z, s_sbob_sl64);
streebog_g (h, m, s_sbob_sl64);
- const u32 r0 = l32_from_64 (h[0]);
- const u32 r1 = h32_from_64 (h[0]);
- const u32 r2 = l32_from_64 (h[1]);
- const u32 r3 = h32_from_64 (h[1]);
+ const u32x r0 = l32_from_64 (h[0]);
+ const u32x r1 = h32_from_64 (h[0]);
+ const u32x r2 = l32_from_64 (h[1]);
+ const u32x r3 = h32_from_64 (h[1]);
- #include COMPARE_S
+ COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
- switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
+ switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
{
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
- switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+ switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
#define _SHA256_SHA1_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
-
-#define uint_to_hex_upper8(i) l_bin2asc[(i)]
+#include "OpenCL/simd.c"
+
+#if VECT_SIZE == 1
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#elif VECT_SIZE == 2
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#elif VECT_SIZE == 4
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#elif VECT_SIZE == 8
+#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#endif
static void m12600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256])
{
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
- u32 f = 0;
- u32 g = 0;
- u32 h = 0;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
+ u32x f = 0;
+ u32x g = 0;
+ u32x h = 0;
#undef K
#define K SHA1C00
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_M
+ COMPARE_M_SIMD (d, h, c, g);
}
}
u32 w0l = w0[0];
- for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
+ for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
- const u32 w0r = bfs_buf[il_pos].i;
+ const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
- w0[0] = w0l | w0r;
+ const u32x w0lr = w0l | w0r;
/**
* sha1
*/
- u32 w0_t = w0[0];
- u32 w1_t = w0[1];
- u32 w2_t = w0[2];
- u32 w3_t = w0[3];
- u32 w4_t = w1[0];
- u32 w5_t = w1[1];
- u32 w6_t = w1[2];
- u32 w7_t = w1[3];
- u32 w8_t = w2[0];
- u32 w9_t = w2[1];
- u32 wa_t = w2[2];
- u32 wb_t = w2[3];
- u32 wc_t = w3[0];
- u32 wd_t = w3[1];
- u32 we_t = 0;
- u32 wf_t = pw_len * 8;
-
- u32 a = SHA1M_A;
- u32 b = SHA1M_B;
- u32 c = SHA1M_C;
- u32 d = SHA1M_D;
- u32 e = SHA1M_E;
- u32 f = 0;
- u32 g = 0;
- u32 h = 0;
+ u32x w0_t = w0lr;
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = 0;
+ u32x wf_t = pw_len * 8;
+
+ u32x a = SHA1M_A;
+ u32x b = SHA1M_B;
+ u32x c = SHA1M_C;
+ u32x d = SHA1M_D;
+ u32x e = SHA1M_E;
+ u32x f = 0;
+ u32x g = 0;
+ u32x h = 0;
#undef K
#define K SHA1C00
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
- const u32 r0 = d;
- const u32 r1 = h;
- const u32 r2 = c;
- const u32 r3 = g;
-
- #include COMPARE_S
+ COMPARE_S_SIMD (d, h, c, g);
}
}
#define MATCHES_NONE_VV(a,b) !(MATCHES_ONE_VV ((a), (b)))
#define MATCHES_NONE_VS(a,b) !(MATCHES_ONE_VS ((a), (b)))
+
+// attack-mode 0
+
+static inline u32x w0r_create_bft (__global bf_t *bfs_buf, const u32 il_pos)
+{
+ #if VECT_SIZE == 1
+ const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i);
+ #elif VECT_SIZE == 2
+ const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
+ #elif VECT_SIZE == 4
+ const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
+ #elif VECT_SIZE == 8
+ const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
+ #endif
+
+ return w0r;
+}
* License.....: MIT
*/
+#define DEVICE_TYPE_CPU 2
+#define DEVICE_TYPE_GPU 4
+
typedef uchar u8;
typedef ushort u16;
typedef uint u32;
#endif
#if VECT_SIZE == 1
-typedef uint u32x;
-typedef ulong u64x;
+typedef uchar u8x;
+typedef ushort u16x;
+typedef uint u32x;
+typedef ulong u64x;
#endif
#if VECT_SIZE == 2
-typedef uint2 u32x;
-typedef ulong2 u64x;
+typedef uchar2 u8x;
+typedef ushort2 u16x;
+typedef uint2 u32x;
+typedef ulong2 u64x;
#endif
#if VECT_SIZE == 4
-typedef uint4 u32x;
-typedef ulong4 u64x;
+typedef uchar4 u8x;
+typedef ushort4 u16x;
+typedef uint4 u32x;
+typedef ulong4 u64x;
#endif
#if VECT_SIZE == 8
-typedef uint8 u32x;
-typedef ulong8 u64x;
+typedef uchar8 u8x;
+typedef ushort8 u16x;
+typedef uint8 u32x;
+typedef ulong8 u64x;
#endif
// this one needs to die
#define allx(r) r
-static inline u32 l32_from_64 (u64 a)
+static inline u32 l32_from_64_S (u64 a)
{
- const u32 r = (uint) (a);
+ const u32 r = (u32) (a);
return r;
}
-static inline u32 h32_from_64 (u64 a)
+static inline u32 h32_from_64_S (u64 a)
{
a >>= 32;
- const u32 r = (uint) (a);
+ const u32 r = (u32) (a);
return r;
}
-static inline u64 hl32_to_64 (const u32 a, const u32 b)
+static inline u64 hl32_to_64_S (const u32 a, const u32 b)
{
return as_ulong ((uint2) (b, a));
}
-#ifdef IS_AMD
-static inline u32 swap32 (const u32 v)
+static inline u32x l32_from_64 (u64x a)
{
- return (as_uint (as_uchar4 (v).s3210));
-}
+ u32x r;
-static inline u64 swap64 (const u64 v)
-{
- return (as_ulong (as_uchar8 (v).s76543210));
+ #if VECT_SIZE == 1
+ r = (u32) a;
+ #endif
+
+ #if VECT_SIZE >= 2
+ r.s0 = (u32) a.s0;
+ r.s1 = (u32) a.s1;
+ #endif
+
+ #if VECT_SIZE >= 4
+ r.s2 = (u32) a.s2;
+ r.s3 = (u32) a.s3;
+ #endif
+
+ #if VECT_SIZE >= 8
+ r.s4 = (u32) a.s4;
+ r.s5 = (u32) a.s5;
+ r.s6 = (u32) a.s6;
+ r.s7 = (u32) a.s7;
+ #endif
+
+ return r;
}
-#endif
-#ifdef IS_NV
-static inline u32 swap32 (const u32 v)
+static inline u32x h32_from_64 (u64x a)
{
- u32 r;
+ a >>= 32;
- asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));
+ u32x r;
+
+ #if VECT_SIZE == 1
+ r = (u32) a;
+ #endif
+
+ #if VECT_SIZE >= 2
+ r.s0 = (u32) a.s0;
+ r.s1 = (u32) a.s1;
+ #endif
+
+ #if VECT_SIZE >= 4
+ r.s2 = (u32) a.s2;
+ r.s3 = (u32) a.s3;
+ #endif
+
+ #if VECT_SIZE >= 8
+ r.s4 = (u32) a.s4;
+ r.s5 = (u32) a.s5;
+ r.s6 = (u32) a.s6;
+ r.s7 = (u32) a.s7;
+ #endif
return r;
}
-static inline u64 swap64 (const u64 v)
+static inline u64x hl32_to_64 (const u32x a, const u32x b)
{
- u32 il;
- u32 ir;
-
- asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));
+ u64x r;
- u32 tl;
- u32 tr;
+ #if VECT_SIZE == 1
+ r = as_ulong ((uint2) (b, a));
+ #endif
- asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
- asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));
+ #if VECT_SIZE >= 2
+ r.s0 = as_ulong ((uint2) (b.s0, a.s0));
+ r.s1 = as_ulong ((uint2) (b.s1, a.s1));
+ #endif
- u64 r;
+ #if VECT_SIZE >= 4
+ r.s2 = as_ulong ((uint2) (b.s2, a.s2));
+ r.s3 = as_ulong ((uint2) (b.s3, a.s3));
+ #endif
- asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));
+ #if VECT_SIZE >= 8
+ r.s4 = as_ulong ((uint2) (b.s4, a.s4));
+ r.s5 = as_ulong ((uint2) (b.s5, a.s5));
+ r.s6 = as_ulong ((uint2) (b.s6, a.s6));
+ r.s7 = as_ulong ((uint2) (b.s7, a.s7));
+ #endif
return r;
}
-#endif
-#ifdef IS_GENERIC
-static inline u32 swap32 (const u32 v)
+#ifdef IS_AMD
+static inline u32 swap32_S (const u32 v)
{
return (as_uint (as_uchar4 (v).s3210));
}
-static inline u64 swap64 (const u64 v)
+static inline u64 swap64_S (const u64 v)
{
return (as_ulong (as_uchar8 (v).s76543210));
}
-#endif
-#ifdef IS_AMD
-static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
+static inline u32 rotr32_S (const u32 a, const u32 n)
{
- return amd_bfe (a, b, c);
+ return rotate (a, 32 - n);
}
-static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
+static inline u32 rotl32_S (const u32 a, const u32 n)
{
- return amd_bytealign (a, b, c);
+ return rotate (a, n);
}
-#endif
-
-#ifdef IS_NV
-static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
+static inline u64 rotr64_S (const u64 a, const u32 n)
{
- u32 r;
+ u64 r;
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
+ #if DEVICE_TYPE == DEVICE_TYPE_CPU
- return r;
-}
+ r = rotate (a, (u64) 64 - n);
-static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
-{
- u32x r;
+ #else
- #if VECT_SIZE == 1
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) );
- #endif
+ uint2 a2 = as_uint2 (a);
- #if VECT_SIZE == 2
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
- #endif
+ uint2 t;
- #if VECT_SIZE == 4
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
- #endif
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32)
+ : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32)
+ : amd_bitalign (a2.s0, a2.s1, n);
+
+ r = as_ulong (t);
- #if VECT_SIZE == 8
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
#endif
return r;
}
-static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
+static inline u64 rotl64_S (const u64 a, const u32 n)
{
- u32 r;
-
- asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
-
- return r;
+ return rotr64_S (a, 64 - n);
}
-#if CUDA_ARCH >= 350
-static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
+static inline u32x swap32 (const u32x v)
{
- u32 r;
-
- asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));
-
- return r;
+ return ((v >> 24) & 0x000000ff)
+ | ((v >> 8) & 0x0000ff00)
+ | ((v << 8) & 0x00ff0000)
+ | ((v << 24) & 0xff000000);
}
-#else
-static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
+
+static inline u64x swap64 (const u64x v)
{
- return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
+ return ((v >> 56) & 0x00000000000000ff)
+ | ((v >> 40) & 0x000000000000ff00)
+ | ((v >> 24) & 0x0000000000ff0000)
+ | ((v >> 8) & 0x00000000ff000000)
+ | ((v << 8) & 0x000000ff00000000)
+ | ((v << 24) & 0x0000ff0000000000)
+ | ((v << 40) & 0x00ff000000000000)
+ | ((v << 56) & 0xff00000000000000);
}
-#endif
-#endif
-#ifdef IS_GENERIC
-static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
+static inline u32x rotr32 (const u32x a, const u32 n)
{
- #define BIT(x) (1 << (x))
- #define BIT_MASK(x) (BIT (x) - 1)
- #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
-
- return BFE (a, b, c);
+ return rotate (a, 32 - n);
}
-static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
+static inline u32x rotl32 (const u32x a, const u32 n)
{
- const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);
-
- return (u32) (tmp);
+ return rotate (a, n);
}
-static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
+static inline u64x rotr64 (const u64x a, const u32 n)
{
- #if VECT_SIZE == 1
- const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);
+ u64x r;
- return (u32x) (tmp);
- #endif
+ #if DEVICE_TYPE == DEVICE_TYPE_CPU
- #if VECT_SIZE == 2
- const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);
+ r = rotate (a, (u64) 64 - n);
- return (u32x) (tmp.s0, tmp.s1);
- #endif
+ #else
- #if VECT_SIZE == 4
- const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);
+ uint2 a2;
+ uint2 t;
- return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);
- #endif
+ #if VECT_SIZE == 1
- #if VECT_SIZE == 8
- const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);
+ a2 = as_uint2 (a);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r = as_ulong (t);
+
+ #elif VECT_SIZE == 2
+
+ {
+ a2 = as_uint2 (a.s0);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s0 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s1);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s1 = as_ulong (t);
+ }
+
+ #elif VECT_SIZE == 4
+
+ {
+ a2 = as_uint2 (a.s0);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s0 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s1);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s1 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s2);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s2 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s3);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s3 = as_ulong (t);
+ }
+
+ #elif VECT_SIZE == 8
+
+ {
+ a2 = as_uint2 (a.s0);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s0 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s1);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s1 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s2);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s2 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s3);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s3 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s4);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s4 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s5);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s5 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s6);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s6 = as_ulong (t);
+ }
+
+ {
+ a2 = as_uint2 (a.s7);
+
+ t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
+ t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
+
+ r.s7 = as_ulong (t);
+ }
- return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
#endif
+ #endif
+
+ return r;
}
-#endif
-#ifdef IS_AMD
-static inline u32x rotr32 (const u32x a, const u32 n)
+static inline u64x rotl64 (const u64x a, const u32 n)
{
- return rotate (a, 32 - n);
+ return rotr64 (a, 64 - n);
}
-static inline u32x rotl32 (const u32x a, const u32 n)
+static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
- return rotate (a, n);
+ return amd_bfe (a, b, c);
}
-static inline u64 rotr64 (const u64 a, const u32 n)
+static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
- uint2 a2 = as_uint2 (a);
+ return amd_bytealign (a, b, c);
+}
+#endif
- uint2 t;
+#ifdef IS_NV
+static inline u32 swap32_S (const u32 v)
+{
+ u32 r;
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32)
- : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32)
- : amd_bitalign (a2.s0, a2.s1, n);
+ asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));
- return as_ulong (t);
+ return r;
}
-static inline u64 rotl64 (const u64 a, const u32 n)
+static inline u64 swap64_S (const u64 v)
{
- return rotr64 (a, 64 - n);
+ u32 il;
+ u32 ir;
+
+ asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));
+
+ u32 tl;
+ u32 tr;
+
+ asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
+ asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));
+
+ u64 r;
+
+ asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));
+
+ return r;
}
-#endif
-#ifdef IS_NV
-static inline u32x rotr32 (const u32x a, const u32 n)
+static inline u32 rotr32_S (const u32 a, const u32 n)
{
return rotate (a, 32 - n);
}
-static inline u32x rotl32 (const u32x a, const u32 n)
+static inline u32 rotl32_S (const u32 a, const u32 n)
{
return rotate (a, n);
}
#if CUDA_ARCH >= 350
-static inline u64 rotr64 (const u64 a, const u32 n)
+static inline u64 rotr64_S (const u64 a, const u32 n)
{
u32 il;
u32 ir;
return r;
}
#else
-static inline u64 rotr64 (const u64 a, const u32 n)
+static inline u64 rotr64_S (const u64 a, const u32 n)
{
return rotate (a, (u64) 64 - n);
}
#endif
-static inline u64 rotl64 (const u64 a, const u32 n)
+static inline u64 rotl64_S (const u64 a, const u32 n)
{
- return rotr64 (a, 64 - n);
+ return rotr64_S (a, 64 - n);
+}
+
+#if CUDA_ARCH >= 500
+static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
}
#endif
-#ifdef IS_GENERIC
+static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
+
+ return r;
+}
+
+static inline u32x swap32 (const u32x v)
+{
+ return ((v >> 24) & 0x000000ff)
+ | ((v >> 8) & 0x0000ff00)
+ | ((v << 8) & 0x00ff0000)
+ | ((v << 24) & 0xff000000);
+}
+
+static inline u64x swap64 (const u64x v)
+{
+ return ((v >> 56) & 0x00000000000000ff)
+ | ((v >> 40) & 0x000000000000ff00)
+ | ((v >> 24) & 0x0000000000ff0000)
+ | ((v >> 8) & 0x00000000ff000000)
+ | ((v << 8) & 0x000000ff00000000)
+ | ((v << 24) & 0x0000ff0000000000)
+ | ((v << 40) & 0x00ff000000000000)
+ | ((v << 56) & 0xff00000000000000);
+}
-static inline u32x rotr32 (const u32x a, const u32x n)
+static inline u32x rotr32 (const u32x a, const u32 n)
{
return rotate (a, 32 - n);
}
-static inline u32x rotl32 (const u32x a, const u32x n)
+static inline u32x rotl32 (const u32x a, const u32 n)
{
return rotate (a, n);
}
-static inline u64 rotr64 (const u64 a, const u32 n)
+static inline u64x rotr64 (const u64x a, const u32 n)
{
return rotate (a, (u64) 64 - n);
}
-static inline u64 rotl64 (const u64 a, const u32 n)
+static inline u64x rotl64 (const u64x a, const u32 n)
{
return rotate (a, (u64) n);
}
+
+static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
+{
+ u32x r;
+
+ #if VECT_SIZE == 1
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) );
+ #endif
+
+ #if VECT_SIZE >= 2
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
+ #endif
+
+ #if VECT_SIZE >= 4
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
+ #endif
+
+ #if VECT_SIZE >= 8
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
+ #endif
+
+ return r;
+}
+
+static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
+
+ return r;
+}
+
+#if CUDA_ARCH >= 350
+static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));
+
+ return r;
+}
+#else
+static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
+{
+ return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
+}
#endif
-#ifdef IS_NV
#if CUDA_ARCH >= 500
static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
{
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif
- #if VECT_SIZE == 2
+ #if VECT_SIZE >= 2
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif
- #if VECT_SIZE == 4
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
+ #if VECT_SIZE >= 4
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif
- #if VECT_SIZE == 8
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
+ #if VECT_SIZE >= 8
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
#endif
#endif
+#ifdef IS_GENERIC
+static inline u32 swap32_S (const u32 v)
+{
+ return (as_uint (as_uchar4 (v).s3210));
+}
+
+static inline u64 swap64_S (const u64 v)
+{
+ return (as_ulong (as_uchar8 (v).s76543210));
+}
+
+static inline u32 rotr32_S (const u32 a, const u32 n)
+{
+ return rotate (a, 32 - n);
+}
+
+static inline u32 rotl32_S (const u32 a, const u32 n)
+{
+ return rotate (a, n);
+}
+
+static inline u64 rotr64_S (const u64 a, const u32 n)
+{
+ return rotate (a, (u64) 64 - n);
+}
+
+static inline u64 rotl64_S (const u64 a, const u32 n)
+{
+ return rotate (a, (u64) n);
+}
+
+static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
+{
+ const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);
+
+ return (u32) (tmp);
+}
+
+static inline u32x swap32 (const u32x v)
+{
+ return ((v >> 24) & 0x000000ff)
+ | ((v >> 8) & 0x0000ff00)
+ | ((v << 8) & 0x00ff0000)
+ | ((v << 24) & 0xff000000);
+}
+
+static inline u64x swap64 (const u64x v)
+{
+ return ((v >> 56) & 0x00000000000000ff)
+ | ((v >> 40) & 0x000000000000ff00)
+ | ((v >> 24) & 0x0000000000ff0000)
+ | ((v >> 8) & 0x00000000ff000000)
+ | ((v << 8) & 0x000000ff00000000)
+ | ((v << 24) & 0x0000ff0000000000)
+ | ((v << 40) & 0x00ff000000000000)
+ | ((v << 56) & 0xff00000000000000);
+}
+
+static inline u32x rotr32 (const u32x a, const u32 n)
+{
+ return rotate (a, 32 - n);
+}
+
+static inline u32x rotl32 (const u32x a, const u32 n)
+{
+ return rotate (a, n);
+}
+
+static inline u64x rotr64 (const u64x a, const u32 n)
+{
+ return rotate (a, (u64) 64 - n);
+}
+
+static inline u64x rotl64 (const u64x a, const u32 n)
+{
+ return rotate (a, (u64) n);
+}
+
+static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
+{
+ #define BIT(x) (1 << (x))
+ #define BIT_MASK(x) (BIT (x) - 1)
+ #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
+
+ return BFE (a, b, c);
+}
+
+static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
+{
+ #if VECT_SIZE == 1
+ const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);
+
+ return (u32x) (tmp);
+ #endif
+
+ #if VECT_SIZE == 2
+ const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);
+
+ return (u32x) (tmp.s0, tmp.s1);
+ #endif
+
+ #if VECT_SIZE == 4
+ const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);
+
+ return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);
+ #endif
+
+ #if VECT_SIZE == 8
+ const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);
+
+ return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
+ #endif
+}
+#endif
+
typedef struct
{
#if defined _DES_
#if defined _MD4_ || defined _DCC2_ || defined _NETNTLMV2_ || defined _KRB5PA_ || defined _MS_DRSR_
+#define MD4_F_S(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
+#define MD4_G_S(x,y,z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+#define MD4_H_S(x,y,z) ((x) ^ (y) ^ (z))
+
#ifdef IS_NV
#if CUDA_ARCH >= 500
#define MD4_F(x,y,z) lut3_ca ((x), (y), (z))
#if defined _MD5_ || defined _MD5H_ || defined _SAPB_ || defined _OLDOFFICE01_ || defined _WPA_ || defined _MD5_SHA1_ || defined _SHA1_MD5_ || defined _NETNTLMV2_ || defined _KRB5PA_ || defined _PBKDF2_MD5_
+#define MD5_F_S(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
+#define MD5_G_S(x,y,z) ((y) ^ ((z) & ((x) ^ (y))))
+#define MD5_H_S(x,y,z) ((x) ^ (y) ^ (z))
+#define MD5_I_S(x,y,z) ((y) ^ ((x) | ~(z)))
+
#ifdef IS_NV
#if CUDA_ARCH >= 500
#define MD5_F(x,y,z) lut3_ca ((x), (y), (z))
#define SHA384_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
{ \
- u64 temp0; \
+ u64x temp0; \
temp0 = K; \
temp0 += x; \
temp0 += h; \
#define SHA512_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
{ \
- u64 temp0; \
+ u64x temp0; \
temp0 = K; \
temp0 += x; \
temp0 += h; \
#ifdef IS_NV
#endif
+
+/**
+ * Generic
+ */
+
+#ifdef IS_GENERIC
+#endif
\ No newline at end of file
#define OPTI_TYPE_SINGLE_HASH (1 << 11)
#define OPTI_TYPE_SINGLE_SALT (1 << 12)
#define OPTI_TYPE_BRUTE_FORCE (1 << 13)
-#define OPTI_TYPE_RAW_HASH (1 << 15)
+#define OPTI_TYPE_RAW_HASH (1 << 14)
+#define OPTI_TYPE_USES_BITS_8 (1 << 15)
+#define OPTI_TYPE_USES_BITS_16 (1 << 16)
+#define OPTI_TYPE_USES_BITS_32 (1 << 17)
+#define OPTI_TYPE_USES_BITS_64 (1 << 18)
#define OPTI_STR_ZERO_BYTE "Zero-Byte"
#define OPTI_STR_PRECOMPUTE_INIT "Precompute-Init"
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_NOT_SALTED
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 14;
dgst_pos1 = 15;
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_APPENDED_SALT
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 14;
dgst_pos1 = 15;
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_APPENDED_SALT
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 14;
dgst_pos1 = 15;
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_PREPENDED_SALT
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 14;
dgst_pos1 = 15;
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_PREPENDED_SALT
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 14;
dgst_pos1 = 15;
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_APPENDED_SALT
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 14;
dgst_pos1 = 15;
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_APPENDED_SALT
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 14;
dgst_pos1 = 15;
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_PREPENDED_SALT
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 14;
dgst_pos1 = 15;
parse_func = hmacsha512_parse_hash;
sort_by_digest = sort_by_digest_8_8;
opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_NOT_ITERATED;
dgst_pos0 = 14;
dgst_pos1 = 15;
parse_func = hmacsha512_parse_hash;
sort_by_digest = sort_by_digest_8_8;
opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_NOT_ITERATED;
dgst_pos0 = 14;
dgst_pos1 = 15;
dgst_size = DGST_SIZE_8_8;
parse_func = sha512crypt_parse_hash;
sort_by_digest = sort_by_digest_8_8;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
parse_func = keccak_parse_hash;
sort_by_digest = sort_by_digest_8_25;
opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 2;
dgst_pos1 = 3;
dgst_size = DGST_SIZE_8_8;
parse_func = truecrypt_parse_hash_1k;
sort_by_digest = sort_by_digest_8_8;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_size = DGST_SIZE_8_8;
parse_func = truecrypt_parse_hash_1k;
sort_by_digest = sort_by_digest_8_8;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_size = DGST_SIZE_8_8;
parse_func = truecrypt_parse_hash_1k;
sort_by_digest = sort_by_digest_8_8;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_size = DGST_SIZE_8_8;
parse_func = sha512aix_parse_hash;
sort_by_digest = sort_by_digest_8_8;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_size = DGST_SIZE_8_16;
parse_func = sha512osx_parse_hash;
sort_by_digest = sort_by_digest_8_16;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_size = DGST_SIZE_8_16;
parse_func = sha512grub_parse_hash;
sort_by_digest = sort_by_digest_8_16;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_size = DGST_SIZE_8_8;
parse_func = drupal7_parse_hash;
sort_by_digest = sort_by_digest_8_8;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED
| OPTI_TYPE_NOT_SALTED
+ | OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_RAW_HASH;
dgst_pos0 = 6;
dgst_pos1 = 7;
dgst_size = DGST_SIZE_8_16;
parse_func = pbkdf2_sha512_parse_hash;
sort_by_digest = sort_by_digest_8_16;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_size = DGST_SIZE_8_8;
parse_func = ecryptfs_parse_hash;
sort_by_digest = sort_by_digest_8_8;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_size = DGST_SIZE_8_16;
parse_func = oraclet_parse_hash;
sort_by_digest = sort_by_digest_8_16;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_USES_BITS_64;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
device_param->platform_devices_id = platform_devices_id;
- // vendor_id
-
- cl_uint vendor_id = 0;
-
- hc_clGetDeviceInfo (device_param->device, CL_DEVICE_VENDOR_ID, sizeof (vendor_id), &vendor_id, NULL);
-
- device_param->vendor_id = vendor_id;
-
// device_type
cl_device_type device_type;
device_param->device_type = device_type;
+ // vendor_id
+
+ cl_uint vendor_id = 0;
+
+ hc_clGetDeviceInfo (device_param->device, CL_DEVICE_VENDOR_ID, sizeof (vendor_id), &vendor_id, NULL);
+
+ device_param->vendor_id = vendor_id;
+
// device_name
char *device_name = (char *) mymalloc (INFOSZ);
// pocl returns the real vendor_id in CL_DEVICE_VENDOR_ID which causes many problems because of hms and missing amd_bfe () etc
// we need to overwrite vendor_id to avoid this. maybe open pocl issue?
- cl_uint vendor_id = 0xffff;
+ cl_uint vendor_id = VENDOR_ID_GENERIC;
device_param->vendor_id = vendor_id;
}
cl_uint vector_width;
- if (1) // can be removed as soon as all kernel are migrated; if (attack_mode == ATTACK_MODE_BF)
+ if (opencl_vector_width == OPENCL_VECTOR_WIDTH)
{
- if (opencl_vector_width == OPENCL_VECTOR_WIDTH)
+ hc_clGetDeviceInfo (device_param->device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, sizeof (vector_width), &vector_width, NULL);
+
+ if ((vendor_id == VENDOR_ID_NV) && (strstr (device_name, " Ti") || strstr (device_name, " TI")))
{
- hc_clGetDeviceInfo (device_param->device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, sizeof (vector_width), &vector_width, NULL);
+ // Yeah that's a super bad hack, but there's no other attribute we could use
+
+ if (vector_width < 2) vector_width *= 2;
}
- else
+
+ if (opti_type & OPTI_TYPE_USES_BITS_64)
{
- vector_width = opencl_vector_width;
+ if (vector_width > 1) vector_width /= 2;
}
}
else
{
- vector_width = 1;
+ vector_width = opencl_vector_width;
}
if (vector_width > 8) vector_width = 8;
* create command-queue
*/
- // not support with NV
+ // not supported with NV
// device_param->command_queue = hc_clCreateCommandQueueWithProperties (device_param->context, device_param->device, NULL);
device_param->command_queue = hc_clCreateCommandQueue (device_param->context, device_param->device, 0);
if (device_type & CL_DEVICE_TYPE_CPU)
{
- // CPU still need lots of workitems, don't know why...
- // for testing phase, lets start with this
-
-// kernel_accel = 1;
+ if (benchmark_mode == 0)
+ {
+ if (kernel_accel > 16)
+ {
+ kernel_accel = 16;
+ }
+ }
+ else
+ {
+ if (kernel_accel > 64)
+ {
+ kernel_accel = 64;
+ }
+ }
}
uint kernel_power = device_processors * kernel_threads * kernel_accel;
// we don't have sm_* on vendors not NV but it doesn't matter
- sprintf (build_opts, "-I%s/ -DVENDOR_ID=%d -DCUDA_ARCH=%d -DVECT_SIZE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width);
+ sprintf (build_opts, "-I%s/ -DVENDOR_ID=%d -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type);
/**
* main kernel
SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6221 6231 6241 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12100 12200 12300 12400 12500 12800 12900 13000"
-OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 --opencl-device-types 2"
+OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 --opencl-device-types 2 --opencl-vector-width 2"
OUTD="test_$(date +%s)"