#define _MD5_
+#define NEW_SIMD_CODE
+
#include "include/constants.h"
#include "include/kernel_vendor.h"
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
+#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
{
u32 a = digest[0];
u32 b = digest[1];
u32 we_t = w3[2];
u32 wf_t = 0;
+ MD5_STEP_S (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
+ MD5_STEP_S (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
+ MD5_STEP_S (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
+ MD5_STEP_S (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
+ MD5_STEP_S (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
+ MD5_STEP_S (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
+ MD5_STEP_S (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
+ MD5_STEP_S (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
+ MD5_STEP_S (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
+ MD5_STEP_S (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
+ MD5_STEP_S (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
+ MD5_STEP_S (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
+ MD5_STEP_S (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
+ MD5_STEP_S (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
+ MD5_STEP_S (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
+ MD5_STEP_S (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
+
+ MD5_STEP_S (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
+ MD5_STEP_S (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
+ MD5_STEP_S (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
+ MD5_STEP_S (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
+ MD5_STEP_S (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
+ MD5_STEP_S (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
+ MD5_STEP_S (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
+ MD5_STEP_S (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
+ MD5_STEP_S (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
+ MD5_STEP_S (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
+ MD5_STEP_S (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
+ MD5_STEP_S (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
+ MD5_STEP_S (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
+ MD5_STEP_S (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
+ MD5_STEP_S (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
+ MD5_STEP_S (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
+
+ MD5_STEP_S (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
+ MD5_STEP_S (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
+ MD5_STEP_S (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
+ MD5_STEP_S (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
+ MD5_STEP_S (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
+ MD5_STEP_S (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
+ MD5_STEP_S (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
+ MD5_STEP_S (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
+ MD5_STEP_S (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
+ MD5_STEP_S (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
+ MD5_STEP_S (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
+ MD5_STEP_S (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
+ MD5_STEP_S (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
+ MD5_STEP_S (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
+ MD5_STEP_S (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
+ MD5_STEP_S (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
+
+ MD5_STEP_S (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
+ MD5_STEP_S (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
+ MD5_STEP_S (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
+ MD5_STEP_S (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
+ MD5_STEP_S (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
+ MD5_STEP_S (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
+ MD5_STEP_S (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
+ MD5_STEP_S (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
+ MD5_STEP_S (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
+ MD5_STEP_S (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
+ MD5_STEP_S (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
+ MD5_STEP_S (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
+ MD5_STEP_S (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
+ MD5_STEP_S (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
+ MD5_STEP_S (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
+ MD5_STEP_S (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+}
+
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
+{
+ u32x a = digest[0];
+ u32x b = digest[1];
+ u32x c = digest[2];
+ u32x d = digest[3];
+
+ u32x w0_t = w0[0];
+ u32x w1_t = w0[1];
+ u32x w2_t = w0[2];
+ u32x w3_t = w0[3];
+ u32x w4_t = w1[0];
+ u32x w5_t = w1[1];
+ u32x w6_t = w1[2];
+ u32x w7_t = w1[3];
+ u32x w8_t = w2[0];
+ u32x w9_t = w2[1];
+ u32x wa_t = w2[2];
+ u32x wb_t = w2[3];
+ u32x wc_t = w3[0];
+ u32x wd_t = w3[1];
+ u32x we_t = w3[2];
+ u32x wf_t = 0;
+
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
block3[2] = block_len * 8;
block3[3] = 0;
- append_0x80_4x4 (block0, block1, block2, block3, block_len);
+ append_0x80_4x4_S (block0, block1, block2, block3, block_len);
/**
* init
digest[2] = MD5M_C;
digest[3] = MD5M_D;
- md5_transform (block0, block1, block2, block3, digest);
+ md5_transform_S (block0, block1, block2, block3, digest);
tmps[gid].digest_buf[0] = digest[0];
tmps[gid].digest_buf[1] = digest[1];
if (gid >= gid_max) return;
- u32 w0[4];
+ u32x w0[4] = { 0 };
+ u32x w1[4] = { 0 };
+ u32x w2[4] = { 0 };
- w0[0] = pws[gid].i[ 0];
- w0[1] = pws[gid].i[ 1];
- w0[2] = pws[gid].i[ 2];
- w0[3] = pws[gid].i[ 3];
+ u32x pw_len = 0;
- u32 w1[4];
+ u32x digest[4] = { 0 };
- w1[0] = pws[gid].i[ 4];
- w1[1] = pws[gid].i[ 5];
- w1[2] = pws[gid].i[ 6];
- w1[3] = pws[gid].i[ 7];
+ #if VECT_SIZE == 1
- u32 w2[4];
+ w0[0] = pws[gid].i[0];
+ w0[1] = pws[gid].i[1];
+ w0[2] = pws[gid].i[2];
+ w0[3] = pws[gid].i[3];
+ w1[0] = pws[gid].i[4];
+ w1[1] = pws[gid].i[5];
+ w1[2] = pws[gid].i[6];
+ w1[3] = pws[gid].i[7];
+ w2[0] = pws[gid].i[8];
+ w2[1] = pws[gid].i[9];
- w2[0] = pws[gid].i[ 8];
- w2[1] = pws[gid].i[ 9];
- w2[2] = 0;
- w2[3] = 0;
-
- const u32 pw_len = pws[gid].pw_len;
-
- /**
- * digest
- */
-
- u32 digest[4];
+ pw_len = pws[gid].pw_len;
digest[0] = tmps[gid].digest_buf[0];
digest[1] = tmps[gid].digest_buf[1];
digest[2] = tmps[gid].digest_buf[2];
digest[3] = tmps[gid].digest_buf[3];
+ #else
+
+ const u32 gidx = gid * VECT_SIZE;
+
+ #if VECT_SIZE >= 2
+
+ if ((gidx + 0) < gid_max)
+ {
+ w0[0].s0 = pws[gidx + 0].i[0];
+ w0[1].s0 = pws[gidx + 0].i[1];
+ w0[2].s0 = pws[gidx + 0].i[2];
+ w0[3].s0 = pws[gidx + 0].i[3];
+ w1[0].s0 = pws[gidx + 0].i[4];
+ w1[1].s0 = pws[gidx + 0].i[5];
+ w1[2].s0 = pws[gidx + 0].i[6];
+ w1[3].s0 = pws[gidx + 0].i[7];
+ w2[0].s0 = pws[gidx + 0].i[8];
+ w2[1].s0 = pws[gidx + 0].i[9];
+
+ pw_len.s0 = pws[gidx + 0].pw_len;
+
+ digest[0].s0 = tmps[gidx + 0].digest_buf[0];
+ digest[1].s0 = tmps[gidx + 0].digest_buf[1];
+ digest[2].s0 = tmps[gidx + 0].digest_buf[2];
+ digest[3].s0 = tmps[gidx + 0].digest_buf[3];
+ }
+
+ if ((gidx + 1) < gid_max)
+ {
+ w0[0].s1 = pws[gidx + 1].i[0];
+ w0[1].s1 = pws[gidx + 1].i[1];
+ w0[2].s1 = pws[gidx + 1].i[2];
+ w0[3].s1 = pws[gidx + 1].i[3];
+ w1[0].s1 = pws[gidx + 1].i[4];
+ w1[1].s1 = pws[gidx + 1].i[5];
+ w1[2].s1 = pws[gidx + 1].i[6];
+ w1[3].s1 = pws[gidx + 1].i[7];
+ w2[0].s1 = pws[gidx + 1].i[8];
+ w2[1].s1 = pws[gidx + 1].i[9];
+
+ pw_len.s1 = pws[gidx + 1].pw_len;
+
+ digest[0].s1 = tmps[gidx + 1].digest_buf[0];
+ digest[1].s1 = tmps[gidx + 1].digest_buf[1];
+ digest[2].s1 = tmps[gidx + 1].digest_buf[2];
+ digest[3].s1 = tmps[gidx + 1].digest_buf[3];
+ }
+
+ #endif
+
+ #if VECT_SIZE >= 4
+
+ if ((gidx + 2) < gid_max)
+ {
+ w0[0].s2 = pws[gidx + 2].i[0];
+ w0[1].s2 = pws[gidx + 2].i[1];
+ w0[2].s2 = pws[gidx + 2].i[2];
+ w0[3].s2 = pws[gidx + 2].i[3];
+ w1[0].s2 = pws[gidx + 2].i[4];
+ w1[1].s2 = pws[gidx + 2].i[5];
+ w1[2].s2 = pws[gidx + 2].i[6];
+ w1[3].s2 = pws[gidx + 2].i[7];
+ w2[0].s2 = pws[gidx + 2].i[8];
+ w2[1].s2 = pws[gidx + 2].i[9];
+
+ pw_len.s2 = pws[gidx + 2].pw_len;
+
+ digest[0].s2 = tmps[gidx + 2].digest_buf[0];
+ digest[1].s2 = tmps[gidx + 2].digest_buf[1];
+ digest[2].s2 = tmps[gidx + 2].digest_buf[2];
+ digest[3].s2 = tmps[gidx + 2].digest_buf[3];
+ }
+
+ if ((gidx + 3) < gid_max)
+ {
+ w0[0].s3 = pws[gidx + 3].i[0];
+ w0[1].s3 = pws[gidx + 3].i[1];
+ w0[2].s3 = pws[gidx + 3].i[2];
+ w0[3].s3 = pws[gidx + 3].i[3];
+ w1[0].s3 = pws[gidx + 3].i[4];
+ w1[1].s3 = pws[gidx + 3].i[5];
+ w1[2].s3 = pws[gidx + 3].i[6];
+ w1[3].s3 = pws[gidx + 3].i[7];
+ w2[0].s3 = pws[gidx + 3].i[8];
+ w2[1].s3 = pws[gidx + 3].i[9];
+
+ pw_len.s3 = pws[gidx + 3].pw_len;
+
+ digest[0].s3 = tmps[gidx + 3].digest_buf[0];
+ digest[1].s3 = tmps[gidx + 3].digest_buf[1];
+ digest[2].s3 = tmps[gidx + 3].digest_buf[2];
+ digest[3].s3 = tmps[gidx + 3].digest_buf[3];
+ }
+
+ #endif
+ #endif
+
/**
* loop
*/
- u32 block_len = (16 + pw_len);
+ u32x block_len = (16 + pw_len);
- u32 block0[4];
+ u32x block0[4];
+ u32x block1[4];
+ u32x block2[4];
+ u32x block3[4];
block0[0] = 0;
block0[1] = 0;
block0[2] = 0;
block0[3] = 0;
-
- u32 block1[4];
-
block1[0] = w0[0];
block1[1] = w0[1];
block1[2] = w0[2];
block1[3] = w0[3];
-
- u32 block2[4];
-
block2[0] = w1[0];
block2[1] = w1[1];
block2[2] = w1[2];
block2[3] = w1[3];
-
- u32 block3[4];
-
block3[0] = w2[0];
block3[1] = w2[1];
block3[2] = block_len * 8;
block3[3] = 0;
- append_0x80_4x4 (block0, block1, block2, block3, block_len);
+ append_0x80_4x4_VV (block0, block1, block2, block3, block_len);
/**
* init
md5_transform (block0, block1, block2, block3, digest);
}
+ #if VECT_SIZE == 1
+
tmps[gid].digest_buf[0] = digest[0];
tmps[gid].digest_buf[1] = digest[1];
tmps[gid].digest_buf[2] = digest[2];
tmps[gid].digest_buf[3] = digest[3];
+
+ #else
+
+ #if VECT_SIZE >= 2
+
+ if ((gidx + 0) < gid_max)
+ {
+ tmps[gidx + 0].digest_buf[0] = digest[0].s0;
+ tmps[gidx + 0].digest_buf[1] = digest[1].s0;
+ tmps[gidx + 0].digest_buf[2] = digest[2].s0;
+ tmps[gidx + 0].digest_buf[3] = digest[3].s0;
+ }
+
+ if ((gidx + 1) < gid_max)
+ {
+ tmps[gidx + 1].digest_buf[0] = digest[0].s1;
+ tmps[gidx + 1].digest_buf[1] = digest[1].s1;
+ tmps[gidx + 1].digest_buf[2] = digest[2].s1;
+ tmps[gidx + 1].digest_buf[3] = digest[3].s1;
+ }
+
+ #endif
+
+ #if VECT_SIZE >= 4
+
+ if ((gidx + 2) < gid_max)
+ {
+ tmps[gidx + 2].digest_buf[0] = digest[0].s2;
+ tmps[gidx + 2].digest_buf[1] = digest[1].s2;
+ tmps[gidx + 2].digest_buf[2] = digest[2].s2;
+ tmps[gidx + 2].digest_buf[3] = digest[3].s2;
+ }
+
+ if ((gidx + 3) < gid_max)
+ {
+ tmps[gidx + 3].digest_buf[0] = digest[0].s3;
+ tmps[gidx + 3].digest_buf[1] = digest[1].s3;
+ tmps[gidx + 3].digest_buf[2] = digest[2].s3;
+ tmps[gidx + 3].digest_buf[3] = digest[3].s3;
+ }
+
+ #endif
+ #endif
}
__kernel void m00400_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global phpass_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
*/
const u32 gid = get_global_id (0);
+ const u32 lid = get_local_id (0);
if (gid >= gid_max) return;
- const u32 lid = get_local_id (0);
-
/**
* digest
*/
return rotr64_S (a, 64 - n);
}
-#if CUDA_ARCH >= 500
-static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
-{
- u32 r;
-
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
- return r;
-}
-
-static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
-{
- u32 r;
-
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
- return r;
-}
-
-static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
-{
- u32 r;
-
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
- return r;
-}
-
-static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
-{
- u32 r;
-
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
- return r;
-}
-
-static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
-{
- u32 r;
-
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
- return r;
-}
-
-static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
-{
- u32 r;
-
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
- return r;
-}
-
-static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
-{
- u32 r;
-
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
- return r;
-}
-#endif
-
static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
}
#endif
-#if CUDA_ARCH >= 500
-static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
-{
- u32x r;
-
- #if VECT_SIZE == 1
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
- #endif
-
- #if VECT_SIZE >= 2
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
- #endif
-
- #if VECT_SIZE >= 4
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
- #endif
-
- #if VECT_SIZE >= 8
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
- #endif
-
- #if VECT_SIZE >= 16
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
- #endif
-
- return r;
-}
-
-static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
-{
- u32x r;
-
- #if VECT_SIZE == 1
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
- #endif
-
- #if VECT_SIZE >= 2
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
- #endif
-
- #if VECT_SIZE >= 4
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
- #endif
-
- #if VECT_SIZE >= 8
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
- #endif
-
- #if VECT_SIZE >= 16
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
- #endif
-
- return r;
-}
-
-static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
-{
- u32x r;
-
- #if VECT_SIZE == 1
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
- #endif
-
- #if VECT_SIZE >= 2
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
- #endif
-
- #if VECT_SIZE >= 4
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
- #endif
-
- #if VECT_SIZE >= 8
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
- #endif
-
- #if VECT_SIZE >= 16
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
- #endif
-
- return r;
-}
-
-static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
-{
- u32x r;
-
- #if VECT_SIZE == 1
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
- #endif
-
- #if VECT_SIZE >= 2
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
- #endif
-
- #if VECT_SIZE >= 4
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
- #endif
-
- #if VECT_SIZE >= 8
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
- #endif
-
- #if VECT_SIZE >= 16
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
- #endif
-
- return r;
-}
-
-static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
-{
- u32x r;
-
- #if VECT_SIZE == 1
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
- #endif
-
- #if VECT_SIZE >= 2
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
- #endif
-
- #if VECT_SIZE >= 4
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
- #endif
-
- #if VECT_SIZE >= 8
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
- #endif
-
- #if VECT_SIZE >= 16
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
- #endif
-
- return r;
-}
-
-static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
-{
- u32x r;
-
- #if VECT_SIZE == 1
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
- #endif
-
- #if VECT_SIZE >= 2
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
- #endif
-
- #if VECT_SIZE >= 4
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
- #endif
-
- #if VECT_SIZE >= 8
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
- #endif
-
- #if VECT_SIZE >= 16
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
- #endif
-
- return r;
-}
-
-static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
-{
- u32x r;
-
- #if VECT_SIZE == 1
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
- #endif
-
- #if VECT_SIZE >= 2
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
- #endif
-
- #if VECT_SIZE >= 4
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
- #endif
-
- #if VECT_SIZE >= 8
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
- #endif
-
- #if VECT_SIZE >= 16
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
- #endif
-
- return r;
-}
-
-#endif
#endif
#ifdef IS_GENERIC
ALIAS_nv_budget * 22 2 A A
ALIAS_nv_budget * 23 2 A A
ALIAS_nv_budget * 200 2 A A
+ALIAS_nv_budget * 400 2 A A
ALIAS_nv_budget * 900 2 A A
ALIAS_nv_budget * 1000 2 A A
ALIAS_nv_budget * 1100 2 A A
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+// Round up to the next integer when `a` has a fractional part (acts as
+// ceil() for non-negative values; used on float expressions such as
+// num_elements / vector_width). All macro parameters are parenthesized
+// so expression arguments like CEIL(x / y) expand correctly.
+// NOTE: `a` is evaluated multiple times -- do not pass side-effecting
+// expressions.
+#define CEIL(a) (((a) - (int) (a)) > 0 ? (a) + 1 : (a))
+
#endif // COMMON_H
#define MD4_H_S(x,y,z) ((x) ^ (y) ^ (z))
#ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define MD4_F(x,y,z) lut3_ca ((x), (y), (z))
-#define MD4_G(x,y,z) lut3_e8 ((x), (y), (z))
-#define MD4_H(x,y,z) lut3_96 ((x), (y), (z))
-#else
#define MD4_F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define MD4_G(x,y,z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define MD4_H(x,y,z) ((x) ^ (y) ^ (z))
-#endif
#define MD4_Fo(x,y,z) (MD4_F((x), (y), (z)))
#define MD4_Go(x,y,z) (MD4_G((x), (y), (z)))
#endif
#define MD5_I_S(x,y,z) ((y) ^ ((x) | ~(z)))
#ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define MD5_F(x,y,z) lut3_ca ((x), (y), (z))
-#define MD5_G(x,y,z) lut3_e4 ((x), (y), (z))
-#define MD5_H(x,y,z) lut3_96 ((x), (y), (z))
-#define MD5_I(x,y,z) lut3_39 ((x), (y), (z))
-#else
#define MD5_F(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define MD5_G(x,y,z) ((y) ^ ((z) & ((x) ^ (y))))
#define MD5_H(x,y,z) ((x) ^ (y) ^ (z))
#define MD5_I(x,y,z) ((y) ^ ((x) | ~(z)))
-#endif
#define MD5_Fo(x,y,z) (MD5_F((x), (y), (z)))
#define MD5_Go(x,y,z) (MD5_G((x), (y), (z)))
#endif
#define MD5_Go(x,y,z) (MD5_G((x), (y), (z)))
#endif
+// Scalar (non-vectorized, plain u32) MD5 round step, the _S counterpart of
+// MD5_STEP for use in md5_transform_S:
+//   a = rotl32_S (a + x + K + f(b, c, d), s) + b
+// Parameters `a` and `b` are evaluated more than once -- pass only simple
+// lvalues (the round variables), never side-effecting expressions.
+#define MD5_STEP_S(f,a,b,c,d,x,K,s) \
+{ \
+ a += K; \
+ a += x; \
+ a += f (b, c, d); \
+ a = rotl32_S (a, s); \
+ a += b; \
+}
+
#define MD5_STEP(f,a,b,c,d,x,K,s) \
{ \
a += K; \
#if defined _SHA1_ || defined _SAPG_ || defined _OFFICE2007_ || defined _OFFICE2010_ || defined _OLDOFFICE34_ || defined _ANDROIDFDE_ || defined _DCC2_ || defined _WPA_ || defined _MD5_SHA1_ || defined _SHA1_MD5_ || defined _PSAFE2_ || defined _LOTUS8_ || defined _PBKDF2_SHA1_ || defined _RAR3_ || defined _SHA256_SHA1_
#ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define SHA1_F0(x,y,z) lut3_ca ((x), (y), (z))
-#define SHA1_F1(x,y,z) lut3_96 ((x), (y), (z))
-#define SHA1_F2(x,y,z) lut3_e8 ((x), (y), (z))
-#else
#define SHA1_F0(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define SHA1_F1(x,y,z) ((x) ^ (y) ^ (z))
#define SHA1_F2(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y))))
-#endif
#define SHA1_F0o(x,y,z) (SHA1_F0 ((x), (y), (z)))
#define SHA1_F2o(x,y,z) (SHA1_F2 ((x), (y), (z)))
#endif
#define SHA256_S3(x) (rotl32 ((x), 26u) ^ rotl32 ((x), 21u) ^ rotl32 ((x), 7u))
#ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define SHA256_F0(x,y,z) lut3_e8 ((x), (y), (z))
-#define SHA256_F1(x,y,z) lut3_ca ((x), (y), (z))
-#else
#define SHA256_F0(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y))))
#define SHA256_F1(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
-#endif
#define SHA256_F0o(x,y,z) (SHA256_F0 ((x), (y), (z)))
#define SHA256_F1o(x,y,z) (SHA256_F1 ((x), (y), (z)))
#endif
#ifdef _RIPEMD160_
#ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define RIPEMD160_F(x,y,z) lut3_96 ((x), (y), (z))
-#define RIPEMD160_G(x,y,z) lut3_ca ((x), (y), (z))
-#define RIPEMD160_H(x,y,z) lut3_59 ((x), (y), (z))
-#define RIPEMD160_I(x,y,z) lut3_e4 ((x), (y), (z))
-#define RIPEMD160_J(x,y,z) lut3_2d ((x), (y), (z))
-#else
#define RIPEMD160_F(x,y,z) ((x) ^ (y) ^ (z))
#define RIPEMD160_G(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) /* x ? y : z */
#define RIPEMD160_H(x,y,z) (((x) | ~(y)) ^ (z))
#define RIPEMD160_I(x,y,z) ((y) ^ ((z) & ((x) ^ (y)))) /* z ? x : y */
#define RIPEMD160_J(x,y,z) ((x) ^ ((y) | ~(z)))
-#endif
#define RIPEMD160_Go(x,y,z) (RIPEMD160_G ((x), (y), (z)))
#define RIPEMD160_Io(x,y,z) (RIPEMD160_I ((x), (y), (z)))
#endif
#define OPTI_TYPE_SINGLE_SALT (1 << 12)
#define OPTI_TYPE_BRUTE_FORCE (1 << 13)
#define OPTI_TYPE_RAW_HASH (1 << 14)
-#define OPTI_TYPE_USES_BITS_8 (1 << 15)
-#define OPTI_TYPE_USES_BITS_16 (1 << 16)
-#define OPTI_TYPE_USES_BITS_32 (1 << 17)
-#define OPTI_TYPE_USES_BITS_64 (1 << 18)
+// NOTE(review): inserting SLOW_HASH_SIMD at bit 15 renumbers every
+// OPTI_TYPE_USES_BITS_* flag (15..18 -> 16..19). Confirm no external
+// consumer (saved sessions, tooling, docs) persists or parses the old
+// bit positions before shipping this renumbering.
+#define OPTI_TYPE_SLOW_HASH_SIMD (1 << 15)
+#define OPTI_TYPE_USES_BITS_8 (1 << 16)
+#define OPTI_TYPE_USES_BITS_16 (1 << 17)
+#define OPTI_TYPE_USES_BITS_32 (1 << 18)
+#define OPTI_TYPE_USES_BITS_64 (1 << 19)
#define OPTI_STR_ZERO_BYTE "Zero-Byte"
#define OPTI_STR_PRECOMPUTE_INIT "Precompute-Init"
#define OPTI_STR_SINGLE_SALT "Single-Salt"
#define OPTI_STR_BRUTE_FORCE "Brute-Force"
#define OPTI_STR_RAW_HASH "Raw-Hash"
+#define OPTI_STR_SLOW_HASH_SIMD "Slow-Hash-SIMD"
#define OPTI_STR_USES_BITS_8 "Uses-8-Bit"
#define OPTI_STR_USES_BITS_16 "Uses-16-Bit"
#define OPTI_STR_USES_BITS_32 "Uses-32-Bit"
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
+ if (kern_run == KERN_RUN_2)
+ {
+ if (data.opti_type & OPTI_TYPE_SLOW_HASH_SIMD)
+ {
+ num_elements = CEIL ((float) num_elements / device_param->vector_width);
+ }
+ }
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
+ while (num_elements % kernel_threads) num_elements++;
+
const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
dgst_size = DGST_SIZE_4_4;
parse_func = phpass_parse_hash;
sort_by_digest = sort_by_digest_4_4;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_SLOW_HASH_SIMD;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
case OPTI_TYPE_SINGLE_SALT: return ((char *) OPTI_STR_SINGLE_SALT); break;
case OPTI_TYPE_BRUTE_FORCE: return ((char *) OPTI_STR_BRUTE_FORCE); break;
case OPTI_TYPE_RAW_HASH: return ((char *) OPTI_STR_RAW_HASH); break;
+ case OPTI_TYPE_SLOW_HASH_SIMD: return ((char *) OPTI_STR_SLOW_HASH_SIMD); break;
case OPTI_TYPE_USES_BITS_8: return ((char *) OPTI_STR_USES_BITS_8); break;
case OPTI_TYPE_USES_BITS_16: return ((char *) OPTI_STR_USES_BITS_16); break;
case OPTI_TYPE_USES_BITS_32: return ((char *) OPTI_STR_USES_BITS_32); break;