From e6e5005a6bac731c887d30e337bd417f3cf2a43a Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Mon, 22 Feb 2016 21:32:38 +0100 Subject: [PATCH] Revert "Zero pws_buf before reuse" This reverts commit b409e5e9e162927b25af88e672326b246f17ec5f. --- OpenCL/m00010_a0.cl | 130 ++++++++--------- OpenCL/m00010_a1.cl | 80 +++-------- OpenCL/m00010_a3.cl | 29 ++-- OpenCL/m00020_a0.cl | 78 +++-------- OpenCL/m00020_a1.cl | 80 ++++------- OpenCL/m00020_a3.cl | 270 ++++++++++++++++++----------------- OpenCL/m00030_a0.cl | 76 +++++----- OpenCL/m00030_a1.cl | 56 +++----- OpenCL/m00030_a3.cl | 29 ++-- OpenCL/m00040_a0.cl | 78 +++-------- OpenCL/m00040_a1.cl | 80 ++++------- OpenCL/m00040_a3.cl | 56 +++++--- OpenCL/m00050_a0.cl | 24 +++- OpenCL/m00050_a3.cl | 24 +++- OpenCL/m00060_a0.cl | 88 ++++++------ OpenCL/m00060_a3.cl | 72 +++++----- OpenCL/m00110_a1.cl | 316 ++++++++++++++++++++++++++--------------- OpenCL/m00120_a1.cl | 260 +++++++++++++++++++++------------- OpenCL/m00130_a1.cl | 256 ++++++++++++++++++++++------------ OpenCL/m00140_a1.cl | 256 ++++++++++++++++++++++------------ OpenCL/m00150_a1.cl | 222 +++++++++++++++++++---------- OpenCL/m00160_a1.cl | 206 ++++++++++++++++++--------- OpenCL/m00190_a1.cl | 324 +++++++++++++++++++++++++++--------------- OpenCL/m00200_a1.cl | 206 ++++++++++++++++++--------- OpenCL/m00300_a1.cl | 308 +++++++++++++++++++++++++--------------- OpenCL/m00900_a1.cl | 229 ++++++++++++++++++++---------- OpenCL/m01000_a1.cl | 244 +++++++++++++++++++++----------- OpenCL/m01100_a1.cl | 248 ++++++++++++++++++++++----------- OpenCL/m01400_a1.cl | 316 ++++++++++++++++++++++++++--------------- OpenCL/m01410_a1.cl | 324 ++++++++++++++++++++++++++---------------- OpenCL/m01420_a1.cl | 320 ++++++++++++++++++++++++++---------------- OpenCL/m01430_a1.cl | 332 +++++++++++++++++++++++++++----------------- OpenCL/m01440_a1.cl | 332 +++++++++++++++++++++++++++----------------- OpenCL/m01450_a1.cl | 222 +++++++++++++++++++---------- OpenCL/m01460_a1.cl | 206 ++++++++++++++++++--------- OpenCL/m01500_a1.cl | 118 +++++++++++----- OpenCL/m01700_a1.cl | 216 +++++++++++++++++++--------- OpenCL/m01710_a1.cl | 224 ++++++++++++++++++++---------- OpenCL/m01720_a1.cl | 220 ++++++++++++++++++----------- OpenCL/m01730_a1.cl | 216 ++++++++++++++++++---------- OpenCL/m01740_a1.cl | 216 ++++++++++++++++++---------- OpenCL/m01750_a1.cl | 208 ++++++++++++++++++--------- OpenCL/m01760_a1.cl | 192 +++++++++++++++++-------- OpenCL/m02400_a1.cl | 224 ++++++++++++++++++++---------- OpenCL/m02410_a1.cl | 232 +++++++++++++++++++++---------- OpenCL/m02610_a1.cl | 227 ++++++++++++++++++++---------- OpenCL/m02710_a1.cl | 228 ++++++++++++++++++++---------- OpenCL/m02810_a1.cl | 228 ++++++++++++++++++++---------- OpenCL/m03000_a1.cl | 124 +++++++++++------ OpenCL/m03100_a1.cl | 193 ++++++++++++++++--------- OpenCL/m03710_a1.cl | 244 +++++++++++++++++++++----------- OpenCL/m03800_a1.cl | 240 +++++++++++++++++++++----------- OpenCL/m04310_a1.cl | 228 ++++++++++++++++++++---------- OpenCL/m04400_a1.cl | 300 +++++++++++++++++++++++++-------------- OpenCL/m04500_a1.cl | 304 +++++++++++++++++++++++++--------------- OpenCL/m04700_a1.cl | 296 +++++++++++++++++++++++++-------------- OpenCL/m04800_a1.cl | 232 +++++++++++++++++++++---------- OpenCL/m04900_a1.cl | 296 +++++++++++++++++++++++++-------------- OpenCL/m05000_a1.cl | 196 ++++++++++++++++++-------- OpenCL/m05100_a1.cl | 173 +++++++++++++++-------- OpenCL/m05300_a1.cl | 222 +++++++++++++++++++---------- OpenCL/m05400_a1.cl | 222 +++++++++++++++++++---------- OpenCL/m05500_a1.cl | 228 ++++++++++++++++++++---------- OpenCL/m05600_a1.cl | 226 ++++++++++++++++++++---------- OpenCL/m06000_a1.cl | 200 +++++++++++++++++--------- OpenCL/m06100_a1.cl | 200 +++++++++++++++++--------- OpenCL/m06900_a1.cl | 192 +++++++++++++++++-------- OpenCL/m07300_a1.cl | 222 +++++++++++++++++++---------- OpenCL/m07500_a1.cl | 188 +++++++++++++++++-------- OpenCL/m07600_a1.cl | 314 ++++++++++++++++++++++++++--------------- OpenCL/m08000_a1.cl | 210 ++++++++++++++++++---------- OpenCL/m08100_a1.cl | 252 +++++++++++++++++++++------------ OpenCL/m08300_a1.cl | 254 ++++++++++++++++++++------------- OpenCL/m08400_a1.cl | 226 ++++++++++++++++++++---------- OpenCL/m08500_a1.cl | 118 +++++++++++----- OpenCL/m08600_a1.cl | 182 ++++++++++++++++-------- OpenCL/m08700_a1.cl | 200 +++++++++++++++++--------- OpenCL/m09720_a1.cl | 216 ++++++++++++++++++---------- OpenCL/m09820_a1.cl | 216 ++++++++++++++++++---------- OpenCL/m09900_a1.cl | 224 ++++++++++++++++++++---------- OpenCL/m10100_a1.cl | 180 ++++++++++++++++-------- OpenCL/m10420_a1.cl | 220 +++++++++++++++++++---------- OpenCL/m10800_a1.cl | 212 +++++++++++++++++++--------- OpenCL/m11000_a1.cl | 244 ++++++++++++++++++++------------ OpenCL/m11100_a1.cl | 228 ++++++++++++++++++++---------- OpenCL/m11200_a1.cl | 300 +++++++++++++++++++++++++-------------- OpenCL/m11400_a1.cl | 244 +++++++++++++++++++++----------- OpenCL/m11500_a1.cl | 188 +++++++++++++++++-------- OpenCL/m11700_a1.cl | 184 ++++++++++++++++-------- OpenCL/m11800_a1.cl | 184 ++++++++++++++++-------- OpenCL/m12600_a1.cl | 312 ++++++++++++++++++++++++++--------------- src/oclHashcat.c | 4 - 92 files changed, 12190 insertions(+), 6574 deletions(-) diff --git a/OpenCL/m00010_a0.cl b/OpenCL/m00010_a0.cl index 85260a9..2c8cebf 100644 --- a/OpenCL/m00010_a0.cl +++ b/OpenCL/m00010_a0.cl @@ -59,26 +59,18 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -99,27 +91,33 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * append salt */ - u32x s0[4] = { 0 }; - u32x s1[4] = { 0 }; - u32x s2[4] = { 0 }; - u32x s3[4] = { 0 }; + u32x s0[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = salt_buf0[3]; + + u32x s1[4]; + s1[0] = salt_buf1[0]; s1[1] = salt_buf1[1]; s1[2] = salt_buf1[2]; s1[3] = salt_buf1[3]; - s2[0] = salt_buf2[0]; - s2[1] = salt_buf2[1]; - s2[2] = salt_buf2[2]; - s2[3] = salt_buf2[3]; - s3[0] = salt_buf3[0]; - s3[1] = salt_buf3[1]; - s3[2] = salt_buf3[2]; - s3[3] = salt_buf3[3]; + + u32x s2[4]; + + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + + u32x s3[4]; + + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); @@ -129,19 +127,24 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w0[1] |= s0[1]; w0[2] |= s0[2]; w0[3] |= s0[3]; + w1[0] |= s1[0]; w1[1] |= s1[1]; w1[2] |= s1[2]; w1[3] |= s1[3]; + w2[0] |= s2[0]; w2[1] |= s2[1]; w2[2] |= s2[2]; w2[3] |= s2[3]; + w3[0] |= s3[0]; w3[1] |= s3[1]; w3[2] = pw_salt_len * 8; w3[3] = 0; + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); + /** * md5 */ @@ -268,26 +271,18 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -320,27 +315,33 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * append salt */ - u32x s0[4] = { 0 }; - u32x s1[4] = { 0 }; - u32x s2[4] = { 0 }; - u32x s3[4] = { 0 }; + u32x s0[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = salt_buf0[3]; + + u32x s1[4]; + s1[0] = salt_buf1[0]; s1[1] = salt_buf1[1]; s1[2] = salt_buf1[2]; s1[3] = salt_buf1[3]; - s2[0] = salt_buf2[0]; - s2[1] = salt_buf2[1]; - s2[2] = salt_buf2[2]; - s2[3] = salt_buf2[3]; - s3[0] = salt_buf3[0]; - s3[1] = salt_buf3[1]; - s3[2] = salt_buf3[2]; - s3[3] = salt_buf3[3]; + + u32x s2[4]; + + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + + u32x s3[4]; + + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); @@ -350,19 +351,24 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w0[1] |= s0[1]; w0[2] |= s0[2]; w0[3] |= s0[3]; + w1[0] |= s1[0]; w1[1] |= s1[1]; w1[2] |= s1[2]; w1[3] |= s1[3]; + w2[0] |= s2[0]; w2[1] |= s2[1]; w2[2] |= s2[2]; w2[3] |= s2[3]; + w3[0] |= s3[0]; w3[1] |= s3[1]; w3[2] = pw_salt_len * 8; w3[3] = 0; + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); + /** * md5 */ diff --git a/OpenCL/m00010_a1.cl b/OpenCL/m00010_a1.cl index 3ce73f1..adcf7d5 100644 --- a/OpenCL/m00010_a1.cl +++ b/OpenCL/m00010_a1.cl @@ -56,25 +56,15 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, u32 salt_buf0[4]; u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +138,8 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w2[3] = wordl2[3] | wordr2[3]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; - w3[2] = wordl3[2] | wordr3[2]; - w3[3] = wordl3[3] | wordr3[3]; + w3[2] = 0; + w3[3] = 0; /** * append salt @@ -168,14 +158,6 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s1[1] = salt_buf1[1]; s1[2] = salt_buf1[2]; s1[3] = salt_buf1[3]; - s2[0] = salt_buf2[0]; - s2[1] = salt_buf2[1]; - s2[2] = salt_buf2[2]; - s2[3] = salt_buf2[3]; - s3[0] = salt_buf3[0]; - s3[1] = salt_buf3[1]; - s3[2] = salt_buf3[2]; - s3[3] = salt_buf3[3]; switch_buffer_by_offset_le_VV (s0, s1, s2, s3, pw_len); @@ -323,25 +305,15 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, u32 salt_buf0[4]; u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -427,8 +399,8 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w2[3] = wordl2[3] | wordr2[3]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; - w3[2] = wordl3[2] | wordr3[2]; - w3[3] = wordl3[3] | wordr3[3]; + w3[2] = 0; + w3[3] = 0; /** * append salt @@ -447,14 +419,6 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s1[1] = salt_buf1[1]; s1[2] = salt_buf1[2]; s1[3] = salt_buf1[3]; - s2[0] = salt_buf2[0]; - s2[1] = salt_buf2[1]; - s2[2] = salt_buf2[2]; - s2[3] = salt_buf2[3]; - s3[0] = salt_buf3[0]; - s3[1] = salt_buf3[1]; - s3[2] = salt_buf3[2]; - s3[3] = salt_buf3[3]; switch_buffer_by_offset_le_VV (s0, s1, s2, s3, pw_len); diff --git a/OpenCL/m00010_a3.cl b/OpenCL/m00010_a3.cl index f2cc12a..ae1a83f 100644 --- a/OpenCL/m00010_a3.cl +++ b/OpenCL/m00010_a3.cl @@ -51,28 +51,32 @@ static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + + u32 salt_buf2[4]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + salt_buf2[1] = 0; + salt_buf2[2] = 0; + salt_buf2[3] = 0; - const u32 salt_len = salt_bufs[salt_pos].salt_len; + u32 salt_buf3[4]; + + salt_buf3[0] = 0; + salt_buf3[1] = 0; + salt_buf3[2] = 0; + salt_buf3[3] = 0; switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); @@ -93,10 +97,11 @@ static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w[14] |= salt_buf3[2]; w[15] |= salt_buf3[3]; + const u32 salt_len = salt_bufs[salt_pos].salt_len; + const u32 pw_salt_len = pw_len + salt_len; w[14] = pw_salt_len * 8; - w[15] = 0; /** * base diff --git a/OpenCL/m00020_a0.cl b/OpenCL/m00020_a0.cl index c10513c..2518b03 100644 --- a/OpenCL/m00020_a0.cl +++ b/OpenCL/m00020_a0.cl @@ -59,26 +59,18 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -133,19 +125,10 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1_t[1] |= salt_buf1[1]; w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] |= salt_buf3[2]; - w3_t[3] |= salt_buf3[3]; append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; - w3_t[3] = 0; /** * md5 @@ -273,26 +256,18 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -359,19 +334,10 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1_t[1] |= salt_buf1[1]; w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] |= salt_buf3[2]; - w3_t[3] |= salt_buf3[3]; append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; - w3_t[3] = 0; /** * md5 diff --git a/OpenCL/m00020_a1.cl b/OpenCL/m00020_a1.cl index c66332e..6afd120 100644 --- a/OpenCL/m00020_a1.cl +++ b/OpenCL/m00020_a1.cl @@ -56,25 +56,15 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, u32 salt_buf0[4]; u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -167,12 +157,12 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1[1] |= salt_buf1[1]; w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; + w2[0] |= 0; + w2[1] |= 0; + w2[2] |= 0; + w2[3] |= 0; + w3[0] |= 0; + w3[1] |= 0; w3[2] = pw_salt_len * 8; w3[3] = 0; @@ -300,25 +290,15 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, u32 salt_buf0[4]; u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -423,12 +403,12 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1[1] |= salt_buf1[1]; w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; + w2[0] |= 0; + w2[1] |= 0; + w2[2] |= 0; + w2[3] |= 0; + w3[0] |= 0; + w3[1] |= 0; w3[2] = pw_salt_len * 8; w3[3] = 0; diff --git a/OpenCL/m00020_a3.cl b/OpenCL/m00020_a3.cl index 8714165..d74fbd7 100644 --- a/OpenCL/m00020_a3.cl +++ b/OpenCL/m00020_a3.cl @@ -20,7 +20,7 @@ #include "OpenCL/common.c" #include "OpenCL/simd.c" -static void m00020m (u32 t0[4], u32 t1[4], u32 t2[4], u32 t3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -34,26 +34,32 @@ static void m00020m (u32 t0[4], u32 t1[4], u32 t2[4], u32 t3[4], const u32 pw_le */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + u32 salt_buf2[4]; + + salt_buf2[0] = 0; + salt_buf2[1] = 0; + salt_buf2[2] = 0; + salt_buf2[3] = 0; + + u32 salt_buf3[4]; + + salt_buf3[0] = 0; + salt_buf3[1] = 0; + salt_buf3[2] = 0; + salt_buf3[3] = 0; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -68,22 +74,22 @@ static void m00020m (u32 t0[4], u32 t1[4], u32 t2[4], u32 t3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = t0[0]; - w0_t[1] = t0[1]; - w0_t[2] = t0[2]; - w0_t[3] = t0[3]; - w1_t[0] = t1[0]; - w1_t[1] = t1[1]; - w1_t[2] = t1[2]; - w1_t[3] = t1[3]; - w2_t[0] = t2[0]; - w2_t[1] = t2[1]; - w2_t[2] = t2[2]; - w2_t[3] = t2[3]; - w3_t[0] = t3[0]; - w3_t[1] = t3[1]; - w3_t[2] = t3[2]; - w3_t[3] = t3[3]; + w0_t[0] = w0[0]; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -108,7 +114,7 @@ static void m00020m (u32 t0[4], u32 t1[4], u32 t2[4], u32 t3[4], const u32 pw_le * loop */ - u32 w0l = t0[0]; + u32 w0l = w0[0]; for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { @@ -137,29 +143,27 @@ static void m00020m (u32 t0[4], u32 t1[4], u32 t2[4], u32 t3[4], const u32 pw_le overwrite_at_le (wx, w0lr, salt_len); - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = wx[ 0]; - w0[1] = wx[ 1]; - w0[2] = wx[ 2]; - w0[3] = wx[ 3]; - w1[0] = wx[ 4]; - w1[1] = wx[ 5]; - w1[2] = wx[ 6]; - w1[3] = wx[ 7]; - w2[0] = wx[ 8]; - w2[1] = wx[ 9]; - w2[2] = wx[10]; - w2[3] = wx[11]; - w3[0] = wx[12]; - w3[1] = wx[13]; - w3[2] = pw_salt_len * 8; - w3[3] = 0; - - append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; + w3_t[2] = pw_salt_len * 8; + w3_t[3] = 0; /** * md5 @@ -170,73 +174,73 @@ static void m00020m (u32 t0[4], u32 t1[4], u32 t2[4], u32 t3[4], const u32 pw_le u32x c = MD5M_C; u32x d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); COMPARE_M_SIMD (a, d, c, b); } @@ -268,26 +272,32 @@ static void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + u32 salt_buf2[4]; + + salt_buf2[0] = 0; + salt_buf2[1] = 0; + salt_buf2[2] = 0; + salt_buf2[3] = 0; + + u32 salt_buf3[4]; + + salt_buf3[0] = 0; + salt_buf3[1] = 0; + salt_buf3[2] = 0; + salt_buf3[3] = 0; const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m00030_a0.cl b/OpenCL/m00030_a0.cl index 7e043ea..953f933 100644 --- a/OpenCL/m00030_a0.cl +++ b/OpenCL/m00030_a0.cl @@ -61,26 +61,18 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -155,8 +147,12 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w2_t[3] |= s2[3]; w3_t[0] |= s3[0]; w3_t[1] |= s3[1]; - w3_t[2] = out_salt_len * 8; - w3_t[3] = 0; + w3_t[2] |= s3[2]; + w3_t[3] |= s3[3]; + + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + + w3_t[2] = out_salt_len * 8; /** * md5 @@ -284,26 +280,18 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -390,8 +378,12 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w2_t[3] |= s2[3]; w3_t[0] |= s3[0]; w3_t[1] |= s3[1]; - w3_t[2] = out_salt_len * 8; - w3_t[3] = 0; + w3_t[2] |= s3[2]; + w3_t[3] |= s3[3]; + + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + + w3_t[2] = out_salt_len * 8; /** * md5 diff --git a/OpenCL/m00030_a1.cl b/OpenCL/m00030_a1.cl index 2113be1..0016776 100644 --- a/OpenCL/m00030_a1.cl +++ b/OpenCL/m00030_a1.cl @@ -58,25 +58,15 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, u32 salt_buf0[4]; u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -322,25 +312,15 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, u32 salt_buf0[4]; u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m00030_a3.cl b/OpenCL/m00030_a3.cl index a79bfc3..946c887 100644 --- a/OpenCL/m00030_a3.cl +++ b/OpenCL/m00030_a3.cl @@ -51,28 +51,32 @@ static void m00030m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + + u32 salt_buf2[4]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + salt_buf2[1] = 0; + salt_buf2[2] = 0; + salt_buf2[3] = 0; - const u32 salt_len = salt_bufs[salt_pos].salt_len; + u32 salt_buf3[4]; + + salt_buf3[0] = 0; + salt_buf3[1] = 0; + salt_buf3[2] = 0; + salt_buf3[3] = 0; switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); @@ -93,10 +97,11 @@ static void m00030m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w[14] |= salt_buf3[2]; w[15] |= salt_buf3[3]; + const u32 salt_len = salt_bufs[salt_pos].salt_len; + const u32 pw_salt_len = pw_len + salt_len; w[14] = pw_salt_len * 8; - w[15] = 0; /** * base diff --git a/OpenCL/m00040_a0.cl b/OpenCL/m00040_a0.cl index 6c41a85..2f6a0aa 100644 --- a/OpenCL/m00040_a0.cl +++ b/OpenCL/m00040_a0.cl @@ -59,26 +59,18 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -119,19 +111,10 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1_t[1] |= salt_buf1[1]; w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] |= salt_buf3[2]; - w3_t[3] |= salt_buf3[3]; append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; - w3_t[3] = 0; /** * md5 @@ -259,26 +242,18 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -331,19 +306,10 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1_t[1] |= salt_buf1[1]; w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] |= salt_buf3[2]; - w3_t[3] |= salt_buf3[3]; append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; - w3_t[3] = 0; /** * md5 diff --git a/OpenCL/m00040_a1.cl b/OpenCL/m00040_a1.cl index 9fa9651..f704c55 100644 --- a/OpenCL/m00040_a1.cl +++ b/OpenCL/m00040_a1.cl @@ -58,25 +58,15 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, u32 salt_buf0[4]; u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -174,12 +164,12 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1[1] |= salt_buf1[1]; w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; + w2[0] |= 0; + w2[1] |= 0; + w2[2] |= 0; + w2[3] |= 0; + w3[0] |= 0; + w3[1] |= 0; w3[2] = pw_salt_len * 8; w3[3] = 0; @@ -308,25 +298,15 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, u32 salt_buf0[4]; u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; - - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -436,12 +416,12 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1[1] |= salt_buf1[1]; w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; + w2[0] |= 0; + w2[1] |= 0; + w2[2] |= 0; + w2[3] |= 0; + w3[0] |= 0; + w3[1] |= 0; w3[2] = pw_salt_len * 8; w3[3] = 0; diff --git a/OpenCL/m00040_a3.cl b/OpenCL/m00040_a3.cl index fabe52b..a60c6d4 100644 --- a/OpenCL/m00040_a3.cl +++ b/OpenCL/m00040_a3.cl @@ -34,26 +34,32 @@ static void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + u32 salt_buf2[4]; + + salt_buf2[0] = 0; + salt_buf2[1] = 0; + salt_buf2[2] = 0; + salt_buf2[3] = 0; + + u32 salt_buf3[4]; + + salt_buf3[0] = 0; + salt_buf3[1] = 0; + salt_buf3[2] = 0; + salt_buf3[3] = 0; const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -266,26 +272,32 @@ static void m00040s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + u32 salt_buf2[4]; + + salt_buf2[0] = 0; + salt_buf2[1] = 0; + salt_buf2[2] = 0; + salt_buf2[3] = 0; + + u32 salt_buf3[4]; + + salt_buf3[0] = 0; + salt_buf3[1] = 0; + salt_buf3[2] = 0; + salt_buf3[3] = 0; const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m00050_a0.cl b/OpenCL/m00050_a0.cl index 09fda93..575b1b4 100644 --- a/OpenCL/m00050_a0.cl +++ b/OpenCL/m00050_a0.cl @@ -242,22 +242,28 @@ __kernel void m00050_m04 (__global pw_t *pws, __global kernel_rule_t * rules_bu */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + + u32 salt_buf2[4]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; + + u32 salt_buf3[4]; + salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; @@ -385,22 +391,28 @@ __kernel void m00050_s04 (__global pw_t *pws, __global kernel_rule_t * rules_bu */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + + u32 salt_buf2[4]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; + + u32 salt_buf3[4]; + salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; diff --git a/OpenCL/m00050_a3.cl b/OpenCL/m00050_a3.cl index 54e5e67..a954bf4 100644 --- a/OpenCL/m00050_a3.cl +++ b/OpenCL/m00050_a3.cl @@ -217,22 +217,28 @@ static void m00050m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + + u32 salt_buf2[4]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; + + u32 salt_buf3[4]; + salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; @@ -328,22 +334,28 @@ static void m00050s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + + u32 salt_buf2[4]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; + + u32 salt_buf3[4]; + salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; diff --git a/OpenCL/m00060_a0.cl b/OpenCL/m00060_a0.cl index 558416c..9515c79 100644 --- a/OpenCL/m00060_a0.cl +++ b/OpenCL/m00060_a0.cl @@ -242,54 +242,50 @@ __kernel void m00060_m04 (__global pw_t *pws, __global kernel_rule_t * rules_bu */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; - - const u32 salt_len = salt_bufs[salt_pos].salt_len; /** * pads */ u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; w0_t[2] = salt_buf0[2]; w0_t[3] = salt_buf0[3]; + + u32x w1_t[4]; + w1_t[0] = salt_buf1[0]; w1_t[1] = salt_buf1[1]; w1_t[2] = salt_buf1[2]; w1_t[3] = salt_buf1[3]; - w2_t[0] = salt_buf2[0]; - w2_t[1] = salt_buf2[1]; - w2_t[2] = salt_buf2[2]; - w2_t[3] = salt_buf2[3]; - w3_t[0] = salt_buf3[0]; - w3_t[1] = salt_buf3[1]; - w3_t[2] = salt_buf3[2]; - w3_t[3] = salt_buf3[3]; + + u32x w2_t[4]; + + w2_t[0] = 0; + w2_t[1] = 0; + w2_t[2] = 0; + w2_t[3] = 0; + + u32x w3_t[4]; + + w3_t[0] = 0; + w3_t[1] = 0; + w3_t[2] = 0; + w3_t[3] = 0; u32x ipad[4]; u32x opad[4]; @@ -381,54 +377,50 @@ __kernel void m00060_s04 (__global pw_t *pws, __global kernel_rule_t * rules_bu */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; - - const u32 salt_len = salt_bufs[salt_pos].salt_len; /** * pads */ u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; w0_t[2] = salt_buf0[2]; w0_t[3] = salt_buf0[3]; + + u32x w1_t[4]; + w1_t[0] = salt_buf1[0]; w1_t[1] = salt_buf1[1]; w1_t[2] = salt_buf1[2]; w1_t[3] = salt_buf1[3]; - w2_t[0] = salt_buf2[0]; - w2_t[1] = salt_buf2[1]; - w2_t[2] = salt_buf2[2]; - w2_t[3] = salt_buf2[3]; - w3_t[0] = salt_buf3[0]; - w3_t[1] = salt_buf3[1]; - w3_t[2] = salt_buf3[2]; - w3_t[3] = salt_buf3[3]; + + u32x w2_t[4]; + + w2_t[0] = 0; + w2_t[1] = 0; + w2_t[2] = 0; + w2_t[3] = 0; + + u32x w3_t[4]; + + w3_t[0] = 0; + w3_t[1] = 0; + w3_t[2] = 0; + w3_t[3] = 0; u32x ipad[4]; u32x opad[4]; diff --git a/OpenCL/m00060_a3.cl b/OpenCL/m00060_a3.cl index b2dd85e..3d5419c 100644 --- a/OpenCL/m00060_a3.cl +++ b/OpenCL/m00060_a3.cl @@ -235,26 +235,32 @@ static void m00060m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le */ u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; w0_t[2] = salt_buf0[2]; w0_t[3] = salt_buf0[3]; + + u32x w1_t[4]; + w1_t[0] = salt_buf1[0]; w1_t[1] = salt_buf1[1]; w1_t[2] = salt_buf1[2]; w1_t[3] = salt_buf1[3]; - w2_t[0] = salt_buf2[0]; - w2_t[1] = salt_buf2[1]; - w2_t[2] = salt_buf2[2]; - w2_t[3] = salt_buf2[3]; - w3_t[0] = salt_buf3[0]; - w3_t[1] = salt_buf3[1]; - w3_t[2] = salt_buf3[2]; - w3_t[3] = salt_buf3[3]; + + u32x w2_t[4]; + + w2_t[0] = 0; + w2_t[1] = 0; + w2_t[2] = 0; + w2_t[3] = 0; + + u32x w3_t[4]; + + w3_t[0] = 0; + w3_t[1] = 0; + w3_t[2] = 0; + w3_t[3] = 0; u32x ipad[4]; u32x opad[4]; @@ -314,54 +320,50 @@ static void m00060s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le */ u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; - u32 salt_buf3[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; - salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; - salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; - salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; - - const u32 salt_len = salt_bufs[salt_pos].salt_len; /** * pads */ u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; w0_t[2] = salt_buf0[2]; w0_t[3] = salt_buf0[3]; + + u32x w1_t[4]; + w1_t[0] = salt_buf1[0]; w1_t[1] = salt_buf1[1]; w1_t[2] = salt_buf1[2]; w1_t[3] = salt_buf1[3]; - w2_t[0] = salt_buf2[0]; - w2_t[1] = salt_buf2[1]; - w2_t[2] = salt_buf2[2]; - w2_t[3] = salt_buf2[3]; - w3_t[0] = salt_buf3[0]; - w3_t[1] = salt_buf3[1]; - w3_t[2] = salt_buf3[2]; - w3_t[3] = salt_buf3[3]; + + u32x w2_t[4]; + + w2_t[0] = 0; + w2_t[1] = 0; + w2_t[2] = 0; + w2_t[3] = 0; + + u32x w3_t[4]; + + w3_t[0] = 0; + w3_t[1] = 0; + w3_t[2] = 0; + w3_t[3] = 0; u32x ipad[4]; u32x opad[4]; diff --git a/OpenCL/m00110_a1.cl b/OpenCL/m00110_a1.cl index 851023e..2642ae4 100644 --- a/OpenCL/m00110_a1.cl +++ b/OpenCL/m00110_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,31 +36,54 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -72,25 +95,39 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr1[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -129,32 +166,32 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[2] = wordl0[2] | wordr0[2] | s0[2]; w0[3] = wordl0[3] | wordr0[3] | s0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0] | s1[0]; w1[1] = wordl1[1] | wordr1[1] | s1[1]; w1[2] = wordl1[2] | wordr1[2] | s1[2]; w1[3] = wordl1[3] | wordr1[3] | s1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0] | s2[0]; w2[1] = wordl2[1] | wordr2[1] | s2[1]; w2[2] = wordl2[2] | wordr2[2] | s2[2]; w2[3] = wordl2[3] | wordr2[3] | s2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0] | s3[0]; w3[1] = wordl3[1] | wordr3[1] | s3[1]; @@ -167,28 +204,28 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -286,7 +323,13 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - COMPARE_M_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -314,31 +357,54 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -362,31 +428,45 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -425,32 +505,32 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[2] = wordl0[2] | wordr0[2] | s0[2]; w0[3] = wordl0[3] | wordr0[3] | s0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0] | s1[0]; w1[1] = wordl1[1] | wordr1[1] | s1[1]; w1[2] = wordl1[2] | wordr1[2] | s1[2]; w1[3] = wordl1[3] | wordr1[3] | s1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0] | s2[0]; w2[1] = wordl2[1] | wordr2[1] | s2[1]; w2[2] = wordl2[2] | wordr2[2] | s2[2]; w2[3] = wordl2[3] | wordr2[3] | s2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0] | s3[0]; w3[1] = wordl3[1] | wordr3[1] | s3[1]; @@ -463,28 +543,28 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -578,14 +658,20 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - COMPARE_S_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m00120_a1.cl b/OpenCL/m00120_a1.cl index 77f4207..8adc5c4 100644 --- a/OpenCL/m00120_a1.cl +++ b/OpenCL/m00120_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,31 +36,54 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -72,35 +95,43 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -123,12 +154,12 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; @@ -183,11 +214,11 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, //w3_t[2] = swap32 (w3_t[2]); //w3_t[3] = swap32 (w3_t[3]); - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -285,7 +316,13 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - COMPARE_M_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -313,31 +350,54 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -361,41 +421,49 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -418,12 +486,12 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; @@ -478,11 +546,11 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, //w3_t[2] = swap32 (w3_t[2]); //w3_t[3] = swap32 (w3_t[3]); - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -577,13 +645,19 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - COMPARE_S_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m00130_a1.cl b/OpenCL/m00130_a1.cl index a13bbbc..58eae95 100644 --- a/OpenCL/m00130_a1.cl +++ b/OpenCL/m00130_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" // no unicode yet @@ -38,31 +38,54 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -74,25 +97,39 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr1[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -135,10 +172,10 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -157,10 +194,10 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -207,11 +244,11 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, //w3_t[2] = swap32 (w3_t[2]); //w3_t[3] = swap32 (w3_t[3]); - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -309,7 +346,13 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - COMPARE_M_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -337,31 +380,54 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -385,31 +451,45 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -452,10 +532,10 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -474,10 +554,10 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -524,11 +604,11 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, //w3_t[2] = swap32 (w3_t[2]); //w3_t[3] = swap32 (w3_t[3]); - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -623,13 +703,19 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - COMPARE_S_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m00140_a1.cl b/OpenCL/m00140_a1.cl index a6aa271..e48b70f 100644 --- a/OpenCL/m00140_a1.cl +++ b/OpenCL/m00140_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" // no unicode yet @@ -38,31 +38,54 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -74,25 +97,39 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr1[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -105,10 +142,10 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -127,10 +164,10 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -171,11 +208,11 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, //w3_t[2] = swap32 (w3_t[2]); //w3_t[3] = swap32 (w3_t[3]); - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -273,7 +310,13 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - COMPARE_M_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -301,31 +344,54 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -349,31 +415,45 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -386,10 +466,10 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -408,10 +488,10 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -452,11 +532,11 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, //w3_t[2] = swap32 (w3_t[2]); //w3_t[3] = swap32 (w3_t[3]); - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -551,13 +631,19 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - COMPARE_S_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m00150_a1.cl b/OpenCL/m00150_a1.cl index 7fb723c..1526cf2 100644 --- a/OpenCL/m00150_a1.cl +++ b/OpenCL/m00150_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { @@ -253,20 +253,41 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -291,53 +312,67 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -348,28 +383,28 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -402,7 +437,12 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_M } } @@ -430,20 +470,41 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -480,53 +541,67 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -537,28 +612,28 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -591,7 +666,12 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_S } } diff --git a/OpenCL/m00160_a1.cl b/OpenCL/m00160_a1.cl index fe3a72d..adba7db 100644 --- a/OpenCL/m00160_a1.cl +++ b/OpenCL/m00160_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { @@ -253,20 +253,41 @@ __kernel void m00160_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -326,53 +347,67 @@ __kernel void m00160_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -402,7 +437,12 @@ __kernel void m00160_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_M } } @@ -430,20 +470,41 @@ __kernel void m00160_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -515,53 +576,67 @@ __kernel void m00160_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -591,7 +666,12 @@ __kernel void m00160_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_S } } diff --git a/OpenCL/m00190_a1.cl b/OpenCL/m00190_a1.cl index 2f54000..3eec61c 100644 --- a/OpenCL/m00190_a1.cl +++ b/OpenCL/m00190_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,71 +36,110 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - const u32x pw_len = pw_l_len + pw_r_len; + const u32 pw_len = pw_l_len + pw_r_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { + append_0x80_2x4 (wordr0, wordr1, pw_r_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -111,28 +150,28 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -235,11 +274,25 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d += SHA1M_D; c += SHA1M_C; - COMPARE_M_SIMD (a, e, d, c); + { + const u32 r0 = a; + const u32 r1 = e; + const u32 r2 = d; + const u32 r3 = c; + + #include COMPARE_M + } a &= 0x00000fff; - COMPARE_M_SIMD (a, e, d, c); + { + const u32 r0 = a; + const u32 r1 = e; + const u32 r2 = d; + const u32 r3 = c; + + #include COMPARE_M + } } } @@ -267,20 +320,43 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -297,53 +373,69 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr0[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { + append_0x80_2x4 (wordr0, wordr1, pw_r_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -354,28 +446,28 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -478,11 +570,25 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d += SHA1M_D; c += SHA1M_C; - COMPARE_S_SIMD (a, e, d, c); + { + const u32 r0 = a; + const u32 r1 = e; + const u32 r2 = d; + const u32 r3 = c; + + #include COMPARE_S + } a &= 0x00000fff; - COMPARE_S_SIMD (a, e, d, c); + { + const u32 r0 = a; + const u32 r1 = e; + const u32 r2 = d; + const u32 r3 = c; + + #include COMPARE_S + } } } diff --git a/OpenCL/m00200_a1.cl b/OpenCL/m00200_a1.cl index d4d09c1..c9e2242 100644 --- a/OpenCL/m00200_a1.cl +++ b/OpenCL/m00200_a1.cl @@ -5,8 +5,6 @@ #define _MYSQL323_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,50 +36,85 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w_t[16]; + u32 w_t[16]; w_t[ 0] = wordl0[0] | wordr0[0]; w_t[ 1] = wordl0[1] | wordr0[1]; @@ -98,10 +133,8 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w_t[14] = wordl3[2] | wordr3[2]; w_t[15] = 0; - u32x a = MYSQL323_A; - u32x b = MYSQL323_B; - u32x c = 0; - u32x d = 0; + u32 a = MYSQL323_A; + u32 b = MYSQL323_B; u32 add = 7; @@ -148,7 +181,12 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, a &= 0x7fffffff; b &= 0x7fffffff; - COMPARE_M_SIMD (a, b, c, d); + const u32 r0 = a; + const u32 r1 = b; + const u32 r2 = 0; + const u32 r3 = 0; + + #include COMPARE_M } } @@ -176,20 +214,41 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -206,32 +265,46 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w_t[16]; + u32 w_t[16]; w_t[ 0] = wordl0[0] | wordr0[0]; w_t[ 1] = wordl0[1] | wordr0[1]; @@ -250,10 +323,8 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w_t[14] = wordl3[2] | wordr3[2]; w_t[15] = 0; - u32x a = MYSQL323_A; - u32x b = MYSQL323_B; - u32x c = 0; - u32x d = 0; + u32 a = MYSQL323_A; + u32 b = MYSQL323_B; u32 add = 7; @@ -300,7 +371,12 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, a &= 0x7fffffff; b &= 0x7fffffff; - COMPARE_S_SIMD (a, b, c, d); + const u32 r0 = a; + const u32 r1 = b; + const u32 r2 = 0; + const u32 r3 = 0; + + #include COMPARE_S } } diff --git a/OpenCL/m00300_a1.cl b/OpenCL/m00300_a1.cl index a5b0b11..ca70b9f 100644 --- a/OpenCL/m00300_a1.cl +++ b/OpenCL/m00300_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,71 +36,110 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { + append_0x80_2x4 (wordr0, wordr1, pw_r_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -111,28 +150,28 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -355,7 +394,13 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - COMPARE_M_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -383,20 +428,43 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -413,59 +481,75 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { + append_0x80_2x4 (wordr0, wordr1, pw_r_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -476,28 +560,28 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -716,14 +800,20 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - COMPARE_S_SIMD (d, e, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m00900_a1.cl b/OpenCL/m00900_a1.cl index 7985608..f302ab7 100644 --- a/OpenCL/m00900_a1.cl +++ b/OpenCL/m00900_a1.cl @@ -5,8 +5,6 @@ #define _MD4_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,81 +36,118 @@ __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD4M_A; - u32x b = MD4M_B; - u32x c = MD4M_C; - u32x d = MD4M_D; + u32 a = MD4M_A; + u32 b = MD4M_B; + u32 c = MD4M_C; + u32 d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w0[1], MD4C00, MD4S01); @@ -162,7 +199,13 @@ __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD4_STEP (MD4_H , d, a, b, c, w2[3], MD4C02, MD4S21); MD4_STEP (MD4_H , c, d, a, b, w1[3], MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, w3[3], MD4C02, MD4S23); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -190,19 +233,43 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -219,63 +286,77 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD4M_A; - u32x b = MD4M_B; - u32x c = MD4M_C; - u32x d = MD4M_D; + u32 a = MD4M_A; + u32 b = MD4M_B; + u32 c = MD4M_C; + u32 d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w0[1], MD4C00, MD4S01); @@ -327,7 +408,13 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD4_STEP (MD4_H , d, a, b, c, w2[3], MD4C02, MD4S21); MD4_STEP (MD4_H , c, d, a, b, w1[3], MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, w3[3], MD4C02, MD4S23); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m01000_a1.cl b/OpenCL/m01000_a1.cl index afd21ba..280c459 100644 --- a/OpenCL/m01000_a1.cl +++ b/OpenCL/m01000_a1.cl @@ -5,8 +5,6 @@ #define _MD4_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,91 +36,128 @@ __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); w3_t[2] = pw_len * 8 * 2; - u32x a = MD4M_A; - u32x b = MD4M_B; - u32x c = MD4M_C; - u32x d = MD4M_D; + u32 a = MD4M_A; + u32 b = MD4M_B; + u32 c = MD4M_C; + u32 d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); @@ -172,7 +209,13 @@ __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -200,20 +243,43 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -230,73 +296,87 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); w3_t[2] = pw_len * 8 * 2; - u32x a = MD4M_A; - u32x b = MD4M_B; - u32x c = MD4M_C; - u32x d = MD4M_D; + u32 a = MD4M_A; + u32 b = MD4M_B; + u32 c = MD4M_C; + u32 d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); @@ -348,7 +428,13 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m01100_a1.cl b/OpenCL/m01100_a1.cl index e3e8547..0a77ff3 100644 --- a/OpenCL/m01100_a1.cl +++ b/OpenCL/m01100_a1.cl @@ -5,8 +5,6 @@ #define _MD4_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,20 +36,43 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -81,73 +104,87 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); w3_t[2] = pw_len * 8 * 2; - u32x a = MD4M_A; - u32x b = MD4M_B; - u32x c = MD4M_C; - u32x d = MD4M_D; + u32 a = MD4M_A; + u32 b = MD4M_B; + u32 c = MD4M_C; + u32 d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); @@ -277,7 +314,13 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -305,20 +348,43 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -362,73 +428,87 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - const u32x pw_len = pw_l_len + pw_r_len; + const u32 pw_len = pw_l_len + pw_r_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); w3_t[2] = pw_len * 8 * 2; - u32x a = MD4M_A; - u32x b = MD4M_B; - u32x c = MD4M_C; - u32x d = MD4M_D; + u32 a = MD4M_A; + u32 b = MD4M_B; + u32 c = MD4M_C; + u32 d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); @@ -556,12 +636,20 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD4_STEP (MD4_H , b, c, d, a, w3_t[1], MD4C02, MD4S23); MD4_STEP (MD4_H , a, b, c, d, w0_t[3], MD4C02, MD4S20); - if (MATCHES_NONE_VS (a, search[0])) continue; + bool q_cond = allx (search[0] != a); + + if (q_cond) continue; MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m01400_a1.cl b/OpenCL/m01400_a1.cl index a4c720e..1354ab4 100644 --- a/OpenCL/m01400_a1.cl +++ b/OpenCL/m01400_a1.cl @@ -5,8 +5,6 @@ #define _SHA256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,53 +36,92 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { + append_0x80_2x4 (wordr0, wordr1, pw_r_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -105,31 +144,31 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * SHA256 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -199,7 +238,13 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_M_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_M } } @@ -227,20 +272,43 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -257,35 +325,51 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { + append_0x80_2x4 (wordr0, wordr1, pw_r_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -308,31 +392,31 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * SHA256 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -402,7 +486,13 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_S_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_S } } diff --git a/OpenCL/m01410_a1.cl b/OpenCL/m01410_a1.cl index ae6ea73..7ed5222 100644 --- a/OpenCL/m01410_a1.cl +++ b/OpenCL/m01410_a1.cl @@ -5,8 +5,6 @@ #define _SHA256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,31 +36,54 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -72,25 +95,39 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr1[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -129,14 +166,14 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[1] = wordl0[1] | wordr0[1] | s0[1]; @@ -161,31 +198,31 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha256 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -255,7 +292,13 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_M_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_M } } @@ -283,31 +326,54 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -331,25 +397,39 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -388,14 +468,14 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[1] = wordl0[1] | wordr0[1] | s0[1]; @@ -420,31 +500,31 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha256 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -514,7 +594,13 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_S_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_S } } diff --git a/OpenCL/m01420_a1.cl b/OpenCL/m01420_a1.cl index aca8ad9..bf55fae 100644 --- a/OpenCL/m01420_a1.cl +++ b/OpenCL/m01420_a1.cl @@ -5,8 +5,6 @@ #define _SHA256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,31 +36,54 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -72,35 +95,43 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -123,7 +154,7 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); @@ -142,31 +173,31 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha256 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -236,7 +267,13 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_M_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_M } } @@ -264,31 +301,54 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -312,35 +372,43 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -363,7 +431,7 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); @@ -382,31 +450,31 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha256 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -476,7 +544,13 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_S_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_S } } diff --git a/OpenCL/m01430_a1.cl b/OpenCL/m01430_a1.cl index 5ca4337..01538ec 100644 --- a/OpenCL/m01430_a1.cl +++ b/OpenCL/m01430_a1.cl @@ -5,8 +5,6 @@ #define _SHA256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,31 +36,54 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -72,35 +95,43 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -155,10 +186,10 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0_t2[4]; - u32x w1_t2[4]; - u32x w2_t2[4]; - u32x w3_t2[4]; + u32 w0_t2[4]; + u32 w1_t2[4]; + u32 w2_t2[4]; + u32 w3_t2[4]; make_unicode (w0, w0_t2, w1_t2); make_unicode (w1, w2_t2, w3_t2); @@ -186,31 +217,31 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha256 */ - u32x w0_t = swap32 (w0_t2[0]); - u32x w1_t = swap32 (w0_t2[1]); - u32x w2_t = swap32 (w0_t2[2]); - u32x w3_t = swap32 (w0_t2[3]); - u32x w4_t = swap32 (w1_t2[0]); - u32x w5_t = swap32 (w1_t2[1]); - u32x w6_t = swap32 (w1_t2[2]); - u32x w7_t = swap32 (w1_t2[3]); - u32x w8_t = swap32 (w2_t2[0]); - u32x w9_t = swap32 (w2_t2[1]); - u32x wa_t = swap32 (w2_t2[2]); - u32x wb_t = swap32 (w2_t2[3]); - u32x wc_t = swap32 (w3_t2[0]); - u32x wd_t = swap32 (w3_t2[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -280,7 +311,13 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_M_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_M } } @@ -308,31 +345,54 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -356,35 +416,43 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -439,10 +507,10 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0_t2[4]; - u32x w1_t2[4]; - u32x w2_t2[4]; - u32x w3_t2[4]; + u32 w0_t2[4]; + u32 w1_t2[4]; + u32 w2_t2[4]; + u32 w3_t2[4]; make_unicode (w0, w0_t2, w1_t2); make_unicode (w1, w2_t2, w3_t2); @@ -470,31 +538,31 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha256 */ - u32x w0_t = swap32 (w0_t2[0]); - u32x w1_t = swap32 (w0_t2[1]); - u32x w2_t = swap32 (w0_t2[2]); - u32x w3_t = swap32 (w0_t2[3]); - u32x w4_t = swap32 (w1_t2[0]); - u32x w5_t = swap32 (w1_t2[1]); - u32x w6_t = swap32 (w1_t2[2]); - u32x w7_t = swap32 (w1_t2[3]); - u32x w8_t = swap32 (w2_t2[0]); - u32x w9_t = swap32 (w2_t2[1]); - u32x wa_t = swap32 (w2_t2[2]); - u32x wb_t = swap32 (w2_t2[3]); - u32x wc_t = swap32 (w3_t2[0]); - u32x wd_t = swap32 (w3_t2[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -564,7 +632,13 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_S_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_S } } diff --git a/OpenCL/m01440_a1.cl b/OpenCL/m01440_a1.cl index 41def87..be82ddf 100644 --- a/OpenCL/m01440_a1.cl +++ b/OpenCL/m01440_a1.cl @@ -5,8 +5,6 @@ #define _SHA256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,31 +36,54 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -72,35 +95,43 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -125,10 +156,10 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0_t2[4]; - u32x w1_t2[4]; - u32x w2_t2[4]; - u32x w3_t2[4]; + u32 w0_t2[4]; + u32 w1_t2[4]; + u32 w2_t2[4]; + u32 w3_t2[4]; make_unicode (w0, w0_t2, w1_t2); make_unicode (w1, w2_t2, w3_t2); @@ -150,31 +181,31 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha256 */ - u32x w0_t = swap32 (w0_t2[0]); - u32x w1_t = swap32 (w0_t2[1]); - u32x w2_t = swap32 (w0_t2[2]); - u32x w3_t = swap32 (w0_t2[3]); - u32x w4_t = swap32 (w1_t2[0]); - u32x w5_t = swap32 (w1_t2[1]); - u32x w6_t = swap32 (w1_t2[2]); - u32x w7_t = swap32 (w1_t2[3]); - u32x w8_t = swap32 (w2_t2[0]); - u32x w9_t = swap32 (w2_t2[1]); - u32x wa_t = swap32 (w2_t2[2]); - u32x wb_t = swap32 (w2_t2[3]); - u32x wc_t = swap32 (w3_t2[0]); - u32x wd_t = swap32 (w3_t2[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -244,7 +275,13 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_M_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_M } } @@ -272,31 +309,54 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -320,35 +380,43 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -373,10 +441,10 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0_t2[4]; - u32x w1_t2[4]; - u32x w2_t2[4]; - u32x w3_t2[4]; + u32 w0_t2[4]; + u32 w1_t2[4]; + u32 w2_t2[4]; + u32 w3_t2[4]; make_unicode (w0, w0_t2, w1_t2); make_unicode (w1, w2_t2, w3_t2); @@ -398,31 +466,31 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha256 */ - u32x w0_t = swap32 (w0_t2[0]); - u32x w1_t = swap32 (w0_t2[1]); - u32x w2_t = swap32 (w0_t2[2]); - u32x w3_t = swap32 (w0_t2[3]); - u32x w4_t = swap32 (w1_t2[0]); - u32x w5_t = swap32 (w1_t2[1]); - u32x w6_t = swap32 (w1_t2[2]); - u32x w7_t = swap32 (w1_t2[3]); - u32x w8_t = swap32 (w2_t2[0]); - u32x w9_t = swap32 (w2_t2[1]); - u32x wa_t = swap32 (w2_t2[2]); - u32x wb_t = swap32 (w2_t2[3]); - u32x wc_t = swap32 (w3_t2[0]); - u32x wd_t = swap32 (w3_t2[1]); - u32x we_t = 0; - u32x wf_t = pw_salt_len * 8; - - u32x a = SHA256M_A; - u32x b = SHA256M_B; - u32x c = SHA256M_C; - u32x d = SHA256M_D; - u32x e = SHA256M_E; - u32x f = SHA256M_F; - u32x g = SHA256M_G; - u32x h = SHA256M_H; + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); + u32 we_t = 0; + u32 wf_t = pw_salt_len * 8; + + u32 a = SHA256M_A; + u32 b = SHA256M_B; + u32 c = SHA256M_C; + u32 d = SHA256M_D; + u32 e = SHA256M_E; + u32 f = SHA256M_F; + u32 g = SHA256M_G; + u32 h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -492,7 +560,13 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_S_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_S } } diff --git a/OpenCL/m01450_a1.cl b/OpenCL/m01450_a1.cl index 8c7659a..e816680 100644 --- a/OpenCL/m01450_a1.cl +++ b/OpenCL/m01450_a1.cl @@ -5,8 +5,6 @@ #define _SHA256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u32 k_sha256[64] = { @@ -243,20 +243,41 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -281,53 +302,67 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -338,28 +373,28 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -392,7 +427,12 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]); + const u32 r0 = digest[3]; + const u32 r1 = digest[7]; + const u32 r2 = digest[2]; + const u32 r3 = digest[6]; + + #include COMPARE_M } } @@ -420,20 +460,41 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -470,53 +531,67 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -527,28 +602,28 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -581,7 +656,12 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]); + const u32 r0 = digest[3]; + const u32 r1 = digest[7]; + const u32 r2 = digest[2]; + const u32 r3 = digest[6]; + + #include COMPARE_S } } diff --git a/OpenCL/m01460_a1.cl b/OpenCL/m01460_a1.cl index 38396c9..2e83a07 100644 --- a/OpenCL/m01460_a1.cl +++ b/OpenCL/m01460_a1.cl @@ -5,8 +5,6 @@ #define _SHA256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u32 k_sha256[64] = { @@ -243,20 +243,41 @@ __kernel void m01460_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -316,53 +337,67 @@ __kernel void m01460_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -392,7 +427,12 @@ __kernel void m01460_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]); + const u32 r0 = digest[3]; + const u32 r1 = digest[7]; + const u32 r2 = digest[2]; + const u32 r3 = digest[6]; + + #include COMPARE_M } } @@ -420,20 +460,41 @@ __kernel void m01460_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -505,53 +566,67 @@ __kernel void m01460_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -581,7 +656,12 @@ __kernel void m01460_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]); + const u32 r0 = digest[3]; + const u32 r1 = digest[7]; + const u32 r2 = digest[2]; + const u32 r3 = digest[6]; + + #include COMPARE_S } } diff --git a/OpenCL/m01500_a1.cl b/OpenCL/m01500_a1.cl index 94a0ca1..cab8c0d 100644 --- a/OpenCL/m01500_a1.cl +++ b/OpenCL/m01500_a1.cl @@ -7,8 +7,6 @@ #define _DES_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define PERM_OP(a,b,tt,n,m) \ { \ @@ -520,7 +520,7 @@ __kernel void m01500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -565,49 +565,69 @@ __kernel void m01500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * main */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - u32x pw_len = pw_l_len + pw_r_len; + u32 pw_len = pw_l_len + pw_r_len; pw_len = (pw_len >= 8) ? 8 : pw_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = 0; + wordr0[3] = 0; + + u32 wordr1[4]; + + wordr1[0] = 0; + wordr1[1] = 0; + wordr1[2] = 0; + wordr1[3] = 0; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = 0; w0[3] = 0; - u32x w1[4]; + u32 w1[4]; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; @@ -628,10 +648,12 @@ __kernel void m01500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, _des_crypt_encrypt (iv, mask, Kc, Kd, s_SPtrans); - u32x c = 0; - u32x d = 0; + const u32 r0 = iv[0]; + const u32 r1 = iv[1]; + const u32 r2 = 0; + const u32 r3 = 0; - COMPARE_M_SIMD (iv[0], iv[1], c, d); + #include COMPARE_M } } @@ -689,7 +711,7 @@ __kernel void m01500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -746,49 +768,69 @@ __kernel void m01500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * main */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - u32x pw_len = pw_l_len + pw_r_len; + u32 pw_len = pw_l_len + pw_r_len; pw_len = (pw_len >= 8) ? 8 : pw_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = 0; + wordr0[3] = 0; + + u32 wordr1[4]; + + wordr1[0] = 0; + wordr1[1] = 0; + wordr1[2] = 0; + wordr1[3] = 0; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = 0; w0[3] = 0; - u32x w1[4]; + u32 w1[4]; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; @@ -809,10 +851,12 @@ __kernel void m01500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, _des_crypt_encrypt (iv, mask, Kc, Kd, s_SPtrans); - u32x c = 0; - u32x d = 0; + const u32 r0 = iv[0]; + const u32 r1 = iv[1]; + const u32 r2 = 0; + const u32 r3 = 0; - COMPARE_S_SIMD (iv[0], iv[1], c, d); + #include COMPARE_S } } diff --git a/OpenCL/m01700_a1.cl b/OpenCL/m01700_a1.cl index f780d25..61c654c 100644 --- a/OpenCL/m01700_a1.cl +++ b/OpenCL/m01700_a1.cl @@ -5,8 +5,6 @@ #define _SHA512_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -157,53 +157,92 @@ __kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { + append_0x80_2x4 (wordr0, wordr1, pw_r_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -226,10 +265,10 @@ __kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * SHA512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); @@ -295,20 +334,43 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -325,35 +387,51 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { + append_0x80_2x4 (wordr0, wordr1, pw_r_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -376,10 +454,10 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * SHA512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); diff --git a/OpenCL/m01710_a1.cl b/OpenCL/m01710_a1.cl index 36a3c0e..85d481c 100644 --- a/OpenCL/m01710_a1.cl +++ b/OpenCL/m01710_a1.cl @@ -5,8 +5,6 @@ #define _SHA512_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -157,31 +157,54 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -193,25 +216,39 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -250,14 +287,14 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[1] = wordl0[1] | wordr0[1] | s0[1]; @@ -282,10 +319,10 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); @@ -351,31 +388,54 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -399,25 +459,39 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -456,14 +530,14 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[1] = wordl0[1] | wordr0[1] | s0[1]; @@ -488,10 +562,10 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); diff --git a/OpenCL/m01720_a1.cl b/OpenCL/m01720_a1.cl index 0f228cc..f3600f2 100644 --- a/OpenCL/m01720_a1.cl +++ b/OpenCL/m01720_a1.cl @@ -5,8 +5,6 @@ #define _SHA512_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -157,31 +157,54 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -193,35 +216,43 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -244,7 +275,7 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); @@ -263,10 +294,10 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); @@ -332,31 +363,54 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -380,35 +434,43 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -431,7 +493,7 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); @@ -450,10 +512,10 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); diff --git a/OpenCL/m01730_a1.cl b/OpenCL/m01730_a1.cl index d15b83d..fea7d83 100644 --- a/OpenCL/m01730_a1.cl +++ b/OpenCL/m01730_a1.cl @@ -5,8 +5,6 @@ #define _SHA512_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -157,31 +157,54 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -193,35 +216,43 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -276,10 +307,10 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -371,31 +402,54 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -419,35 +473,43 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -502,10 +564,10 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); diff --git a/OpenCL/m01740_a1.cl b/OpenCL/m01740_a1.cl index b3455f3..878a381 100644 --- a/OpenCL/m01740_a1.cl +++ b/OpenCL/m01740_a1.cl @@ -5,8 +5,6 @@ #define _SHA512_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -157,31 +157,54 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -193,35 +216,43 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -246,10 +277,10 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -335,31 +366,54 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -383,35 +437,43 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -436,10 +498,10 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); diff --git a/OpenCL/m01750_a1.cl b/OpenCL/m01750_a1.cl index 1529bc9..a14700f 100644 --- a/OpenCL/m01750_a1.cl +++ b/OpenCL/m01750_a1.cl @@ -5,8 +5,6 @@ #define _SHA512_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -274,20 +274,41 @@ __kernel void m01750_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -312,53 +333,67 @@ __kernel void m01750_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -369,28 +404,28 @@ __kernel void m01750_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -457,20 +492,41 @@ __kernel void m01750_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -507,53 +563,67 @@ __kernel void m01750_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -564,28 +634,28 @@ __kernel void m01750_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; diff --git a/OpenCL/m01760_a1.cl b/OpenCL/m01760_a1.cl index ef1f458..78d1527 100644 --- a/OpenCL/m01760_a1.cl +++ b/OpenCL/m01760_a1.cl @@ -5,8 +5,6 @@ #define _SHA512_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -274,20 +274,41 @@ __kernel void m01760_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -347,53 +368,67 @@ __kernel void m01760_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -457,20 +492,41 @@ __kernel void m01760_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -542,53 +598,67 @@ __kernel void m01760_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; diff --git a/OpenCL/m02400_a1.cl b/OpenCL/m02400_a1.cl index 4e89e82..751c3a1 100644 --- a/OpenCL/m02400_a1.cl +++ b/OpenCL/m02400_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,71 +36,106 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -112,10 +147,10 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1[0] = 0x80; w3[2] = 16 * 8; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -189,7 +224,13 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d &= 0x00ffffff; c &= 0x00ffffff; b &= 0x00ffffff; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -217,20 +258,41 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -247,53 +309,67 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -305,10 +381,10 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1[0] = 0x80; w3[2] = 16 * 8; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -387,7 +463,13 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d &= 0x00ffffff; c &= 0x00ffffff; b &= 0x00ffffff; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m02410_a1.cl b/OpenCL/m02410_a1.cl index 88bafcc..bba4043 100644 --- a/OpenCL/m02410_a1.cl +++ b/OpenCL/m02410_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,20 +36,41 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -67,25 +88,39 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -124,32 +159,32 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[2] = wordl0[2] | wordr0[2] | s0[2]; w0[3] = wordl0[3] | wordr0[3] | s0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -161,10 +196,10 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1[0] = 0x80; w3[2] = 16 * 8; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -238,7 +273,13 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d &= 0x00ffffff; c &= 0x00ffffff; b &= 0x00ffffff; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -266,20 +307,41 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -309,25 +371,39 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -366,32 +442,32 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[2] = wordl0[2] | wordr0[2] | s0[2]; w0[3] = wordl0[3] | wordr0[3] | s0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -403,10 +479,10 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1[0] = 0x80; w3[2] = 16 * 8; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -485,7 +561,13 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d &= 0x00ffffff; c &= 0x00ffffff; b &= 0x00ffffff; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m02610_a1.cl b/OpenCL/m02610_a1.cl index 3bcb2c7..ade1427 100644 --- a/OpenCL/m02610_a1.cl +++ b/OpenCL/m02610_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -36,20 +36,43 @@ __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -90,63 +113,77 @@ __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -320,7 +357,12 @@ __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - COMPARE_M_SIMD (a, d, c, b); + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -346,20 +388,43 @@ __kernel void m02610_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -412,63 +477,77 @@ __kernel void m02610_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -641,7 +720,13 @@ __kernel void m02610_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m02710_a1.cl b/OpenCL/m02710_a1.cl index af4762d..e65a088 100644 --- a/OpenCL/m02710_a1.cl +++ b/OpenCL/m02710_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -36,20 +36,43 @@ __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -92,63 +115,77 @@ __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -404,7 +441,13 @@ __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -430,20 +473,43 @@ __kernel void m02710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -498,63 +564,77 @@ __kernel void m02710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -813,7 +893,13 @@ __kernel void m02710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m02810_a1.cl b/OpenCL/m02810_a1.cl index 32b2fd3..4e2e749 100644 --- a/OpenCL/m02810_a1.cl +++ b/OpenCL/m02810_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -36,20 +36,43 @@ __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -91,63 +114,77 @@ __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -403,7 +440,13 @@ __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -429,20 +472,43 @@ __kernel void m02810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -496,63 +562,77 @@ __kernel void m02810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -811,7 +891,13 @@ __kernel void m02810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m03000_a1.cl b/OpenCL/m03000_a1.cl index 88dddad..d416213 100644 --- a/OpenCL/m03000_a1.cl +++ b/OpenCL/m03000_a1.cl @@ -7,8 +7,6 @@ #define _DES_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define PERM_OP(a,b,tt,n,m) \ { \ @@ -517,7 +517,7 @@ __kernel void m03000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -556,55 +556,69 @@ __kernel void m03000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * main */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - u32x pw_len = pw_l_len + pw_r_len; + u32 pw_len = pw_l_len + pw_r_len; pw_len = (pw_len >= 7) ? 7 : pw_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = 0; + wordr0[3] = 0; + + u32 wordr1[4]; + + wordr1[0] = 0; + wordr1[1] = 0; + wordr1[2] = 0; + wordr1[3] = 0; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = 0; w0[3] = 0; - u32x w1[4]; + u32 w1[4]; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; @@ -632,10 +646,12 @@ __kernel void m03000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); - u32x c = 0; - u32x d = 0; + const u32 r0 = iv[0]; + const u32 r1 = iv[1]; + const u32 r2 = 0; + const u32 r3 = 0; - COMPARE_M_SIMD (iv[0], iv[1], c, d); + #include COMPARE_M } } @@ -693,7 +709,7 @@ __kernel void m03000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -744,49 +760,69 @@ __kernel void m03000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * main */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - u32x pw_len = pw_l_len + pw_r_len; + u32 pw_len = pw_l_len + pw_r_len; pw_len = (pw_len >= 7) ? 7 : pw_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = 0; + wordr0[3] = 0; + + u32 wordr1[4]; + + wordr1[0] = 0; + wordr1[1] = 0; + wordr1[2] = 0; + wordr1[3] = 0; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = 0; w0[3] = 0; - u32x w1[4]; + u32 w1[4]; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; @@ -814,10 +850,12 @@ __kernel void m03000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); - u32x c = 0; - u32x d = 0; + const u32 r0 = iv[0]; + const u32 r1 = iv[1]; + const u32 r2 = 0; + const u32 r3 = 0; - COMPARE_S_SIMD (iv[0], iv[1], c, d); + #include COMPARE_S } } diff --git a/OpenCL/m03100_a1.cl b/OpenCL/m03100_a1.cl index 22e79de..de174c5 100644 --- a/OpenCL/m03100_a1.cl +++ b/OpenCL/m03100_a1.cl @@ -7,8 +7,6 @@ #define _DES_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define PERM_OP(a,b,tt,n,m) \ { \ @@ -558,7 +558,7 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -566,12 +566,14 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -583,55 +585,69 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - const u32x pw_len = pw_l_len + pw_r_len; + const u32 pw_len = pw_l_len + pw_r_len; const u32 salt_word_len = (salt_len + pw_len) * 2; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -642,10 +658,10 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; @@ -785,10 +801,12 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * cmp */ - u32x c = 0; - u32x d = 0; + const u32 r0 = iv[0]; + const u32 r1 = iv[1]; + const u32 r2 = 0; + const u32 r3 = 0; - COMPARE_M_SIMD (iv[0], iv[1], c, d); + #include COMPARE_M } } @@ -846,31 +864,54 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ u32 salt_buf0[4]; - u32 salt_buf1[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; + + u32 salt_buf1[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; @@ -894,55 +935,69 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - const u32x pw_len = pw_l_len + pw_r_len; + const u32 pw_len = pw_l_len + pw_r_len; const u32 salt_word_len = (salt_len + pw_len) * 2; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -953,10 +1008,10 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; @@ -1096,10 +1151,12 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * cmp */ - u32x c = 0; - u32x d = 0; + const u32 r0 = iv[0]; + const u32 r1 = iv[1]; + const u32 r2 = 0; + const u32 r3 = 0; - COMPARE_S_SIMD (iv[0], iv[1], c, d); + #include COMPARE_S } } diff --git a/OpenCL/m03710_a1.cl b/OpenCL/m03710_a1.cl index ca4e4a8..d9380da 100644 --- a/OpenCL/m03710_a1.cl +++ b/OpenCL/m03710_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -55,20 +55,43 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -109,63 +132,77 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -240,10 +277,10 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, c += MD5M_C; d += MD5M_D; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 | uint_to_hex_lower8 ((a >> 8) & 255) << 16; @@ -373,7 +410,13 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -418,20 +461,43 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -484,63 +550,77 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -615,10 +695,10 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, c += MD5M_C; d += MD5M_D; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 | uint_to_hex_lower8 ((a >> 8) & 255) << 16; @@ -748,7 +828,13 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m03800_a1.cl b/OpenCL/m03800_a1.cl index d6e86f1..fe2fdf6 100644 --- a/OpenCL/m03800_a1.cl +++ b/OpenCL/m03800_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -34,20 +34,41 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -86,53 +107,67 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -143,28 +178,28 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -251,10 +286,10 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -324,7 +359,13 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -350,20 +391,41 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -414,53 +476,67 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -471,28 +547,28 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -579,10 +655,10 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -651,7 +727,13 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m04310_a1.cl b/OpenCL/m04310_a1.cl index 57858c8..08bf814 100644 --- a/OpenCL/m04310_a1.cl +++ b/OpenCL/m04310_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -55,20 +55,43 @@ __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -90,63 +113,77 @@ __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -319,7 +356,13 @@ __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -364,20 +407,43 @@ __kernel void m04310_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -411,63 +477,77 @@ __kernel void m04310_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -640,7 +720,13 @@ __kernel void m04310_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m04400_a1.cl b/OpenCL/m04400_a1.cl index 578dabe..f1826d2 100644 --- a/OpenCL/m04400_a1.cl +++ b/OpenCL/m04400_a1.cl @@ -5,8 +5,6 @@ #define _MD5_SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -55,43 +55,80 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -100,28 +137,28 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -132,28 +169,28 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -361,7 +398,13 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -406,20 +449,43 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -436,25 +502,39 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -463,28 +543,28 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -495,28 +575,28 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -724,7 +804,13 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m04500_a1.cl b/OpenCL/m04500_a1.cl index 3a4c6b8..a526039 100644 --- a/OpenCL/m04500_a1.cl +++ b/OpenCL/m04500_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8_le(i) l_bin2asc[(i)] @@ -55,43 +55,80 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -100,28 +137,28 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -132,28 +169,28 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -390,7 +427,13 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -435,20 +478,43 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -465,31 +531,45 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -498,28 +578,28 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -530,28 +610,28 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -785,13 +865,19 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m04700_a1.cl b/OpenCL/m04700_a1.cl index 262c4d5..b673061 100644 --- a/OpenCL/m04700_a1.cl +++ b/OpenCL/m04700_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -19,7 +17,9 @@ #undef _MD5_ #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8_le(i) l_bin2asc[(i)] @@ -56,43 +56,80 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -101,28 +138,28 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -133,10 +170,10 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -215,31 +252,31 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + u32 w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; - u32x w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + u32 w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; - u32x w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + u32 w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; - u32x w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + u32 w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; - u32x w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + u32 w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; - u32x w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + u32 w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; - u32x w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + u32 w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; - u32x w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + u32 w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; - u32x w8_t = 0x80000000; - u32x w9_t = 0; - u32x wa_t = 0; - u32x wb_t = 0; - u32x wc_t = 0; - u32x wd_t = 0; - u32x we_t = 0; - u32x wf_t = 32 * 8; + u32 w8_t = 0x80000000; + u32 w9_t = 0; + u32 wa_t = 0; + u32 wb_t = 0; + u32 wc_t = 0; + u32 wd_t = 0; + u32 we_t = 0; + u32 wf_t = 32 * 8; u32 e; @@ -344,7 +381,13 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -389,20 +432,43 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -419,31 +485,45 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -452,28 +532,28 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -484,10 +564,10 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -566,31 +646,31 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + u32 w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; - u32x w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + u32 w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; - u32x w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + u32 w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; - u32x w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + u32 w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; - u32x w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + u32 w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; - u32x w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + u32 w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; - u32x w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + u32 w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; - u32x w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + u32 w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; - u32x w8_t = 0x80000000; - u32x w9_t = 0; - u32x wa_t = 0; - u32x wb_t = 0; - u32x wc_t = 0; - u32x wd_t = 0; - u32x we_t = 0; - u32x wf_t = 32 * 8; + u32 w8_t = 0x80000000; + u32 w9_t = 0; + u32 wa_t = 0; + u32 wb_t = 0; + u32 wc_t = 0; + u32 wd_t = 0; + u32 we_t = 0; + u32 wf_t = 32 * 8; u32 e; @@ -692,13 +772,19 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m04800_a1.cl b/OpenCL/m04800_a1.cl index 33c654a..cc09dfb 100644 --- a/OpenCL/m04800_a1.cl +++ b/OpenCL/m04800_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,20 +36,41 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -68,53 +89,67 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -163,7 +198,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; w0[0] |= s0[0]; w0[1] |= s0[1]; @@ -186,10 +221,10 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -258,7 +293,13 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -286,20 +327,41 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -330,53 +392,67 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - const u32x pw_len = pw_l_len + pw_r_len; + const u32 pw_len = pw_l_len + pw_r_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -425,7 +501,7 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; w0[0] |= s0[0]; w0[1] |= s0[1]; @@ -448,10 +524,10 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -518,12 +594,20 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - if (MATCHES_NONE_VS (a, search[0])) continue; + bool q_cond = allx (search[0] != a); + + if (q_cond) continue; MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m04900_a1.cl b/OpenCL/m04900_a1.cl index 8aa494e..80b3387 100644 --- a/OpenCL/m04900_a1.cl +++ b/OpenCL/m04900_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,20 +36,41 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -88,53 +109,67 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = wordl0[0] | wordr0[0]; w0_t[1] = wordl0[1] | wordr0[1]; w0_t[2] = wordl0[2] | wordr0[2]; w0_t[3] = wordl0[3] | wordr0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = wordl1[0] | wordr1[0]; w1_t[1] = wordl1[1] | wordr1[1]; w1_t[2] = wordl1[2] | wordr1[2]; w1_t[3] = wordl1[3] | wordr1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = wordl2[0] | wordr2[0]; w2_t[1] = wordl2[1] | wordr2[1]; w2_t[2] = wordl2[2] | wordr2[2]; w2_t[3] = wordl2[3] | wordr2[3]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = wordl3[0] | wordr3[0]; w3_t[1] = wordl3[1] | wordr3[1]; @@ -219,32 +254,32 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); - u32x w0 = swap32 (w0_t[0]); - u32x w1 = swap32 (w0_t[1]); - u32x w2 = swap32 (w0_t[2]); - u32x w3 = swap32 (w0_t[3]); - u32x w4 = swap32 (w1_t[0]); - u32x w5 = swap32 (w1_t[1]); - u32x w6 = swap32 (w1_t[2]); - u32x w7 = swap32 (w1_t[3]); - u32x w8 = swap32 (w2_t[0]); - u32x w9 = swap32 (w2_t[1]); - u32x wa = swap32 (w2_t[2]); - u32x wb = swap32 (w2_t[3]); - u32x wc = swap32 (w3_t[0]); - u32x wd = swap32 (w3_t[1]); - u32x we = 0; - u32x wf = pw_salt_len * 8; + u32 w0 = swap32 (w0_t[0]); + u32 w1 = swap32 (w0_t[1]); + u32 w2 = swap32 (w0_t[2]); + u32 w3 = swap32 (w0_t[3]); + u32 w4 = swap32 (w1_t[0]); + u32 w5 = swap32 (w1_t[1]); + u32 w6 = swap32 (w1_t[2]); + u32 w7 = swap32 (w1_t[3]); + u32 w8 = swap32 (w2_t[0]); + u32 w9 = swap32 (w2_t[1]); + u32 wa = swap32 (w2_t[2]); + u32 wb = swap32 (w2_t[3]); + u32 wc = swap32 (w3_t[0]); + u32 wd = swap32 (w3_t[1]); + u32 we = 0; + u32 wf = pw_salt_len * 8; /** * sha1 */ - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -341,7 +376,13 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wd = rotl32 ((wa ^ w5 ^ wf ^ wd), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd); we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we); wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -369,20 +410,41 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -433,59 +495,73 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = wordl0[0] | wordr0[0]; w0_t[1] = wordl0[1] | wordr0[1]; w0_t[2] = wordl0[2] | wordr0[2]; w0_t[3] = wordl0[3] | wordr0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = wordl1[0] | wordr1[0]; w1_t[1] = wordl1[1] | wordr1[1]; w1_t[2] = wordl1[2] | wordr1[2]; w1_t[3] = wordl1[3] | wordr1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = wordl2[0] | wordr2[0]; w2_t[1] = wordl2[1] | wordr2[1]; w2_t[2] = wordl2[2] | wordr2[2]; w2_t[3] = wordl2[3] | wordr2[3]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = wordl3[0] | wordr3[0]; w3_t[1] = wordl3[1] | wordr3[1]; @@ -570,32 +646,32 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); - u32x w0 = swap32 (w0_t[0]); - u32x w1 = swap32 (w0_t[1]); - u32x w2 = swap32 (w0_t[2]); - u32x w3 = swap32 (w0_t[3]); - u32x w4 = swap32 (w1_t[0]); - u32x w5 = swap32 (w1_t[1]); - u32x w6 = swap32 (w1_t[2]); - u32x w7 = swap32 (w1_t[3]); - u32x w8 = swap32 (w2_t[0]); - u32x w9 = swap32 (w2_t[1]); - u32x wa = swap32 (w2_t[2]); - u32x wb = swap32 (w2_t[3]); - u32x wc = swap32 (w3_t[0]); - u32x wd = swap32 (w3_t[1]); - u32x we = 0; - u32x wf = pw_salt_len * 8; + u32 w0 = swap32 (w0_t[0]); + u32 w1 = swap32 (w0_t[1]); + u32 w2 = swap32 (w0_t[2]); + u32 w3 = swap32 (w0_t[3]); + u32 w4 = swap32 (w1_t[0]); + u32 w5 = swap32 (w1_t[1]); + u32 w6 = swap32 (w1_t[2]); + u32 w7 = swap32 (w1_t[3]); + u32 w8 = swap32 (w2_t[0]); + u32 w9 = swap32 (w2_t[1]); + u32 wa = swap32 (w2_t[2]); + u32 wb = swap32 (w2_t[3]); + u32 wc = swap32 (w3_t[0]); + u32 wd = swap32 (w3_t[1]); + u32 we = 0; + u32 wf = pw_salt_len * 8; /** * sha1 */ - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -689,13 +765,19 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wa = rotl32 ((w7 ^ w2 ^ wc ^ wa), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa); wb = rotl32 ((w8 ^ w3 ^ wd ^ wb), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; wc = rotl32 ((w9 ^ w4 ^ we ^ wc), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc); wd = rotl32 ((wa ^ w5 ^ wf ^ wd), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd); we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we); wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m05000_a1.cl b/OpenCL/m05000_a1.cl index 37d2c81..d648989 100644 --- a/OpenCL/m05000_a1.cl +++ b/OpenCL/m05000_a1.cl @@ -5,8 +5,6 @@ #define _KECCAK_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 keccakf_rndc[24] = { @@ -102,20 +102,43 @@ __kernel void m05000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x01_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * 0x80 keccak, very special */ @@ -130,25 +153,39 @@ __kernel void m05000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -157,28 +194,28 @@ __kernel void m05000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -326,20 +363,43 @@ __kernel void m05000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x01_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -366,25 +426,39 @@ __kernel void m05000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -393,28 +467,28 @@ __kernel void m05000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; diff --git a/OpenCL/m05100_a1.cl b/OpenCL/m05100_a1.cl index a9165f5..a8de2f9 100644 --- a/OpenCL/m05100_a1.cl +++ b/OpenCL/m05100_a1.cl @@ -5,8 +5,6 @@ #define _MD5H_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m05100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,81 +36,118 @@ __kernel void m05100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr0[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -282,72 +319,86 @@ __kernel void m05100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2x4_S (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); diff --git a/OpenCL/m05300_a1.cl b/OpenCL/m05300_a1.cl index 363a512..5b72c03 100644 --- a/OpenCL/m05300_a1.cl +++ b/OpenCL/m05300_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) { @@ -248,71 +248,106 @@ __kernel void m05300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -323,28 +358,28 @@ __kernel void m05300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -440,7 +475,12 @@ __kernel void m05300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]); + const u32 r0 = digest[0]; + const u32 r1 = digest[3]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_M } } @@ -497,20 +537,41 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -527,53 +588,67 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -584,28 +659,28 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -701,7 +776,12 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]); + const u32 r0 = digest[0]; + const u32 r1 = digest[3]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_S } } diff --git a/OpenCL/m05400_a1.cl b/OpenCL/m05400_a1.cl index a0b3ad8..e4a9f14 100644 --- a/OpenCL/m05400_a1.cl +++ b/OpenCL/m05400_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { @@ -282,71 +282,106 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -357,28 +392,28 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -474,7 +509,12 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_M } } @@ -531,20 +571,41 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -561,53 +622,67 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -618,28 +693,28 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -735,7 +810,12 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_S } } diff --git a/OpenCL/m05500_a1.cl b/OpenCL/m05500_a1.cl index 48437e3..ad0193b 100644 --- a/OpenCL/m05500_a1.cl +++ b/OpenCL/m05500_a1.cl @@ -7,8 +7,6 @@ #define _MD4_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define PERM_OP(a,b,tt,n,m) \ { \ @@ -528,20 +528,43 @@ __kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -559,73 +582,87 @@ __kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); w3_t[2] = pw_len * 8 * 2; - u32x a = MD4M_A; - u32x b = MD4M_B; - u32x c = MD4M_C; - u32x d = MD4M_D; + u32 a = MD4M_A; + u32 b = MD4M_B; + u32 c = MD4M_C; + u32 d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); @@ -785,20 +822,43 @@ __kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -828,73 +888,87 @@ __kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); w3_t[2] = pw_len * 8 * 2; - u32x a = MD4M_A; - u32x b = MD4M_B; - u32x c = MD4M_C; - u32x d = MD4M_D; + u32 a = MD4M_A; + u32 b = MD4M_B; + u32 c = MD4M_C; + u32 d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); diff --git a/OpenCL/m05600_a1.cl b/OpenCL/m05600_a1.cl index 4aa7984..d133270 100644 --- a/OpenCL/m05600_a1.cl +++ b/OpenCL/m05600_a1.cl @@ -5,8 +5,6 @@ #define _NETNTLMV2_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void md4_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) { @@ -326,81 +326,118 @@ __kernel void m05600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -552,7 +589,12 @@ __kernel void m05600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]); + const u32 r0 = digest[0]; + const u32 r1 = digest[3]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_M } } @@ -606,20 +648,43 @@ __kernel void m05600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -636,63 +701,77 @@ __kernel void m05600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -844,7 +923,12 @@ __kernel void m05600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]); + const u32 r0 = digest[0]; + const u32 r1 = digest[3]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_S } } diff --git a/OpenCL/m06000_a1.cl b/OpenCL/m06000_a1.cl index d0c97c7..ae9f541 100644 --- a/OpenCL/m06000_a1.cl +++ b/OpenCL/m06000_a1.cl @@ -5,8 +5,6 @@ #define _RIPEMD160_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void ripemd160_transform (const u32 w[16], u32 dgst[5]) { @@ -233,78 +233,115 @@ __kernel void m06000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x wl[16]; + u32 wl[16]; wl[ 0] = w0[0]; wl[ 1] = w0[1]; @@ -366,20 +403,43 @@ __kernel void m06000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -396,60 +456,74 @@ __kernel void m06000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = pw_len * 8; w3[3] = 0; - u32x wl[16]; + u32 wl[16]; wl[ 0] = w0[0]; wl[ 1] = w0[1]; diff --git a/OpenCL/m06100_a1.cl b/OpenCL/m06100_a1.cl index 6923192..d499903 100644 --- a/OpenCL/m06100_a1.cl +++ b/OpenCL/m06100_a1.cl @@ -7,8 +7,6 @@ #define _WHIRLPOOL_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define R 10 @@ -1381,78 +1381,115 @@ __kernel void m06100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = 0; w3[3] = 0; - u32x wl[16]; + u32 wl[16]; wl[ 0] = swap32 (w0[0]); wl[ 1] = swap32 (w0[1]); @@ -1555,20 +1592,43 @@ __kernel void m06100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -1585,60 +1645,74 @@ __kernel void m06100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = 0; w3[3] = 0; - u32x wl[16]; + u32 wl[16]; wl[ 0] = swap32 (w0[0]); wl[ 1] = swap32 (w0[1]); diff --git a/OpenCL/m06900_a1.cl b/OpenCL/m06900_a1.cl index 269b89b..123eb5d 100644 --- a/OpenCL/m06900_a1.cl +++ b/OpenCL/m06900_a1.cl @@ -7,8 +7,6 @@ #define _GOST_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u32 c_tables[4][256] = { @@ -727,71 +727,106 @@ __kernel void m06900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -994,20 +1029,41 @@ __kernel void m06900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -1024,53 +1080,67 @@ __kernel void m06900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; diff --git a/OpenCL/m07300_a1.cl b/OpenCL/m07300_a1.cl index c922ae8..5032ac5 100644 --- a/OpenCL/m07300_a1.cl +++ b/OpenCL/m07300_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { @@ -253,20 +253,41 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -277,53 +298,67 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -334,28 +369,28 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -415,7 +450,12 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_M } } @@ -443,20 +483,41 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -479,53 +540,67 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -536,28 +611,28 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * pads */ - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -617,7 +692,12 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_S } } diff --git a/OpenCL/m07500_a1.cl b/OpenCL/m07500_a1.cl index 1251a99..b9b74c3 100644 --- a/OpenCL/m07500_a1.cl +++ b/OpenCL/m07500_a1.cl @@ -5,8 +5,6 @@ #define _KRB5PA_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -561,20 +559,41 @@ __kernel void m07500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -601,53 +620,67 @@ __kernel void m07500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -703,20 +736,41 @@ __kernel void m07500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -743,53 +797,67 @@ __kernel void m07500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; diff --git a/OpenCL/m07600_a1.cl b/OpenCL/m07600_a1.cl index 264f187..62db3ee 100644 --- a/OpenCL/m07600_a1.cl +++ b/OpenCL/m07600_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -55,20 +55,43 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -109,25 +132,39 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -136,28 +173,28 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -168,28 +205,28 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -297,7 +334,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * Prepend salt */ - u32x w0t[4]; + u32 w0t[4]; w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 | uint_to_hex_lower8 ((a >> 16) & 255) << 16; @@ -308,7 +345,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 | uint_to_hex_lower8 ((b >> 0) & 255) << 16; - u32x w1t[4]; + u32 w1t[4]; w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 | uint_to_hex_lower8 ((c >> 16) & 255) << 16; @@ -319,7 +356,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 | uint_to_hex_lower8 ((d >> 0) & 255) << 16; - u32x w2t[2]; + u32 w2t[2]; w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 | uint_to_hex_lower8 ((e >> 16) & 255) << 16; @@ -600,7 +637,13 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, c += r_c; d += r_d; e += r_e; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -645,20 +688,43 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -711,31 +777,45 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -744,28 +824,28 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -776,28 +856,28 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -905,7 +985,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * Prepend salt */ - u32x w0t[4]; + u32 w0t[4]; w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 | uint_to_hex_lower8 ((a >> 16) & 255) << 16; @@ -916,7 +996,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 | uint_to_hex_lower8 ((b >> 0) & 255) << 16; - u32x w1t[4]; + u32 w1t[4]; w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 | uint_to_hex_lower8 ((c >> 16) & 255) << 16; @@ -927,7 +1007,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 | uint_to_hex_lower8 ((d >> 0) & 255) << 16; - u32x w2t[2]; + u32 w2t[2]; w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 | uint_to_hex_lower8 ((e >> 16) & 255) << 16; @@ -1208,7 +1288,13 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, c += r_c; d += r_d; e += r_e; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m08000_a1.cl b/OpenCL/m08000_a1.cl index 152fca8..07e6399 100644 --- a/OpenCL/m08000_a1.cl +++ b/OpenCL/m08000_a1.cl @@ -7,8 +7,6 @@ #define _SHA256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u32 k_sha256[64] = { @@ -289,49 +289,78 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -350,16 +379,16 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - u32x w_t[16]; + u32 w_t[16]; w_t[ 0] = swap32 (w0_t[0]); w_t[ 1] = swap32 (w0_t[1]); @@ -416,7 +445,12 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, sha256_transform_s (digest, w_s1); // 448 - 512 sha256_transform_s (digest, w_s2); // 512 - 576 - COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]); + const u32 r0 = digest[3]; + const u32 r1 = digest[7]; + const u32 r2 = digest[2]; + const u32 r3 = digest[6]; + + #include COMPARE_M } } @@ -491,20 +525,41 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -521,31 +576,39 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -564,16 +627,16 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - u32x w_t[16]; + u32 w_t[16]; w_t[ 0] = swap32 (w0_t[0]); w_t[ 1] = swap32 (w0_t[1]); @@ -630,7 +693,12 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, sha256_transform_s (digest, w_s1); // 448 - 512 sha256_transform_s (digest, w_s2); // 512 - 576 - COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]); + const u32 r0 = digest[3]; + const u32 r1 = digest[7]; + const u32 r2 = digest[2]; + const u32 r3 = digest[6]; + + #include COMPARE_S } } diff --git a/OpenCL/m08100_a1.cl b/OpenCL/m08100_a1.cl index 91b9fa1..325c08b 100644 --- a/OpenCL/m08100_a1.cl +++ b/OpenCL/m08100_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,20 +36,41 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -65,35 +86,43 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -116,12 +145,12 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; @@ -163,11 +192,11 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, //w3_t[2] = swap32 (w3_t[2]); //w3_t[3] = swap32 (w3_t[3]); - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -264,7 +293,13 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -292,20 +327,41 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -333,41 +389,49 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * reverse */ - const u32 e_rev = rotl32_S (search[1], 2u); + const u32 e_rev = rotl32 (search[1], 2u); /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -390,12 +454,12 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; @@ -437,11 +501,11 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, //w3_t[2] = swap32 (w3_t[2]); //w3_t[3] = swap32 (w3_t[3]); - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -536,12 +600,18 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); - if (MATCHES_NONE_VS (e, e_rev)) continue; + if (allx (e != e_rev)) continue; w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m08300_a1.cl b/OpenCL/m08300_a1.cl index a1f30e6..74b7658 100644 --- a/OpenCL/m08300_a1.cl +++ b/OpenCL/m08300_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { @@ -164,20 +164,41 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -220,35 +241,43 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -267,28 +296,28 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -367,28 +396,28 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t2[4]; + u32 w0_t2[4]; w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); - u32x w1_t2[4]; + u32 w1_t2[4]; w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); - u32x w2_t2[4]; + u32 w2_t2[4]; w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); - u32x w3_t2[4]; + u32 w3_t2[4]; w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); @@ -409,28 +438,28 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = 0; i < salt_iter; i++) { - u32x w0_t3[4]; + u32 w0_t3[4]; w0_t3[0] = digest[0]; w0_t3[1] = digest[1]; w0_t3[2] = digest[2]; w0_t3[3] = digest[3]; - u32x w1_t3[4]; + u32 w1_t3[4]; w1_t3[0] = digest[4]; w1_t3[1] = swap32 (salt_buf0[0]); w1_t3[2] = swap32 (salt_buf0[1]); w1_t3[3] = swap32 (salt_buf0[2]); - u32x w2_t3[4]; + u32 w2_t3[4]; w2_t3[0] = swap32 (salt_buf0[3]); w2_t3[1] = swap32 (salt_buf1[0]); w2_t3[2] = swap32 (salt_buf1[1]); w2_t3[3] = swap32 (salt_buf1[2]); - u32x w3_t3[4]; + u32 w3_t3[4]; w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; @@ -446,7 +475,12 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest); } - COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_M } } @@ -474,20 +508,41 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -542,35 +597,43 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -589,28 +652,28 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -689,28 +752,28 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t2[4]; + u32 w0_t2[4]; w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); - u32x w1_t2[4]; + u32 w1_t2[4]; w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); - u32x w2_t2[4]; + u32 w2_t2[4]; w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); - u32x w3_t2[4]; + u32 w3_t2[4]; w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); @@ -731,28 +794,28 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = 0; i < salt_iter; i++) { - u32x w0_t3[4]; + u32 w0_t3[4]; w0_t3[0] = digest[0]; w0_t3[1] = digest[1]; w0_t3[2] = digest[2]; w0_t3[3] = digest[3]; - u32x w1_t3[4]; + u32 w1_t3[4]; w1_t3[0] = digest[4]; w1_t3[1] = swap32 (salt_buf0[0]); w1_t3[2] = swap32 (salt_buf0[1]); w1_t3[3] = swap32 (salt_buf0[2]); - u32x w2_t3[4]; + u32 w2_t3[4]; w2_t3[0] = swap32 (salt_buf0[3]); w2_t3[1] = swap32 (salt_buf1[0]); w2_t3[2] = swap32 (salt_buf1[1]); w2_t3[3] = swap32 (salt_buf1[2]); - u32x w3_t3[4]; + u32 w3_t3[4]; w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; @@ -768,7 +831,12 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest); } - COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_S } } diff --git a/OpenCL/m08400_a1.cl b/OpenCL/m08400_a1.cl index cfffdef..be3048b 100644 --- a/OpenCL/m08400_a1.cl +++ b/OpenCL/m08400_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8_le(i) l_bin2asc[(i)] @@ -183,20 +183,43 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -228,25 +251,39 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -255,56 +292,56 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = swap32 (w2[0]); w2_t[1] = swap32 (w2[1]); w2_t[2] = swap32 (w2[2]); w2_t[3] = swap32 (w2[3]); - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = swap32 (w3[0]); w3_t[1] = swap32 (w3[1]); @@ -447,7 +484,12 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_M } } @@ -492,20 +534,43 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -549,25 +614,39 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -576,56 +655,56 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); w0_t[2] = swap32 (w0[2]); w0_t[3] = swap32 (w0[3]); - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = swap32 (w1[0]); w1_t[1] = swap32 (w1[1]); w1_t[2] = swap32 (w1[2]); w1_t[3] = swap32 (w1[3]); - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = swap32 (w2[0]); w2_t[1] = swap32 (w2[1]); w2_t[2] = swap32 (w2[2]); w2_t[3] = swap32 (w2[3]); - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = swap32 (w3[0]); w3_t[1] = swap32 (w3[1]); @@ -768,7 +847,12 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); + const u32 r0 = digest[3]; + const u32 r1 = digest[4]; + const u32 r2 = digest[2]; + const u32 r3 = digest[1]; + + #include COMPARE_S } } diff --git a/OpenCL/m08500_a1.cl b/OpenCL/m08500_a1.cl index fa71396..f3f653d 100644 --- a/OpenCL/m08500_a1.cl +++ b/OpenCL/m08500_a1.cl @@ -7,8 +7,6 @@ #define _DES_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define PERM_OP(a,b,tt,n,m) \ { \ @@ -583,7 +583,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -599,49 +599,69 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * main */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - u32x pw_len = pw_l_len + pw_r_len; + u32 pw_len = pw_l_len + pw_r_len; pw_len = (pw_len >= 8) ? 8 : pw_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = 0; + wordr0[3] = 0; + + u32 wordr1[4]; + + wordr1[0] = 0; + wordr1[1] = 0; + wordr1[2] = 0; + wordr1[3] = 0; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = 0; w0[3] = 0; - u32x w1[4]; + u32 w1[4]; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; @@ -669,10 +689,12 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); - u32x c = 0; - u32x d = 0; + const u32 r0 = iv[0]; + const u32 r1 = iv[1]; + const u32 r2 = 0; + const u32 r3 = 0; - COMPARE_M_SIMD (iv[0], iv[1], c, d); + #include COMPARE_M } } @@ -762,7 +784,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -790,49 +812,69 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * main */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - u32x pw_len = pw_l_len + pw_r_len; + u32 pw_len = pw_l_len + pw_r_len; pw_len = (pw_len >= 8) ? 8 : pw_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = 0; + wordr0[3] = 0; + + u32 wordr1[4]; + + wordr1[0] = 0; + wordr1[1] = 0; + wordr1[2] = 0; + wordr1[3] = 0; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = 0; w0[3] = 0; - u32x w1[4]; + u32 w1[4]; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; - u32x w2[4]; + u32 w2[4]; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; - u32x w3[4]; + u32 w3[4]; w3[0] = 0; w3[1] = 0; @@ -860,10 +902,12 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); - u32x c = 0; - u32x d = 0; + const u32 r0 = iv[0]; + const u32 r1 = iv[1]; + const u32 r2 = 0; + const u32 r3 = 0; - COMPARE_S_SIMD (iv[0], iv[1], c, d); + #include COMPARE_S } } diff --git a/OpenCL/m08600_a1.cl b/OpenCL/m08600_a1.cl index ed37d29..9ff7b37 100644 --- a/OpenCL/m08600_a1.cl +++ b/OpenCL/m08600_a1.cl @@ -7,8 +7,6 @@ #define _LOTUS5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u32 lotus_magic_table[256] = { @@ -261,50 +261,85 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; @@ -399,20 +434,41 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -429,32 +485,46 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; @@ -473,7 +543,7 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w[14] = wordl3[2] | wordr3[2]; w[15] = wordl3[3] | wordr3[3]; - u32x state[4]; + u32 state[4]; state[0] = 0; state[1] = 0; diff --git a/OpenCL/m08700_a1.cl b/OpenCL/m08700_a1.cl index 4298d6d..7ee90f4 100644 --- a/OpenCL/m08700_a1.cl +++ b/OpenCL/m08700_a1.cl @@ -7,8 +7,6 @@ #define _LOTUS6_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u32 lotus_magic_table[256] = { @@ -292,20 +292,41 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -317,32 +338,46 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; @@ -361,7 +396,7 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w[14] = wordl3[2] | wordr3[2]; w[15] = wordl3[3] | wordr3[3]; - u32x state[4]; + u32 state[4]; state[0] = 0; state[1] = 0; @@ -434,10 +469,10 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, domino_big_md (w, 34, state, s_lotus_magic_table); - u32x a = state[0] & 0xffffffff; - u32x b = state[1] & 0xffffffff; - u32x c = state[2] & 0x000000ff; - u32x d = state[3] & 0x00000000; + u32 a = state[0] & 0xffffffff; + u32 b = state[1] & 0xffffffff; + u32 c = state[2] & 0x000000ff; + u32 d = state[3] & 0x00000000; const u32 r0 = a; const u32 r1 = b; @@ -496,20 +531,41 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -533,32 +589,46 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; @@ -577,7 +647,7 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w[14] = wordl3[2] | wordr3[2]; w[15] = wordl3[3] | wordr3[3]; - u32x state[4]; + u32 state[4]; state[0] = 0; state[1] = 0; @@ -650,10 +720,10 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, domino_big_md (w, 34, state, s_lotus_magic_table); - u32x a = state[0] & 0xffffffff; - u32x b = state[1] & 0xffffffff; - u32x c = state[2] & 0x000000ff; - u32x d = state[3] & 0x00000000; + u32 a = state[0] & 0xffffffff; + u32 b = state[1] & 0xffffffff; + u32 c = state[2] & 0x000000ff; + u32 d = state[3] & 0x00000000; const u32 r0 = a; const u32 r1 = b; diff --git a/OpenCL/m09720_a1.cl b/OpenCL/m09720_a1.cl index ac59921..fdff5b9 100644 --- a/OpenCL/m09720_a1.cl +++ b/OpenCL/m09720_a1.cl @@ -5,8 +5,6 @@ #define _OLDOFFICE01_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) { @@ -478,20 +478,41 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -507,53 +528,67 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -562,10 +597,10 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, append_0x80_2x4 (w0, w1, pw_len); - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -595,8 +630,8 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, gen336 (digest_pre, salt_buf, digest); - u32x a = digest[0]; - u32x b = digest[1] & 0xff; + u32 a = digest[0]; + u32 b = digest[1] & 0xff; const u32 r0 = a; const u32 r1 = b; @@ -631,20 +666,41 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -672,53 +728,67 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -727,10 +797,10 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, append_0x80_2x4 (w0, w1, pw_len); - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -760,8 +830,8 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, gen336 (digest_pre, salt_buf, digest); - u32x a = digest[0]; - u32x b = digest[1] & 0xff; + u32 a = digest[0]; + u32 b = digest[1] & 0xff; const u32 r0 = a; const u32 r1 = b; diff --git a/OpenCL/m09820_a1.cl b/OpenCL/m09820_a1.cl index dd2a05a..73f62b9 100644 --- a/OpenCL/m09820_a1.cl +++ b/OpenCL/m09820_a1.cl @@ -5,8 +5,6 @@ #define _OLDOFFICE34_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { @@ -164,20 +164,41 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -195,55 +216,69 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - const u32x pw_len = pw_l_len + pw_r_len; + const u32 pw_len = pw_l_len + pw_r_len; const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -252,10 +287,10 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, append_0x80_2x4 (w0, w1, pw_len); - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -314,8 +349,8 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32x a = swap32 (digest[0]); - u32x b = swap32 (digest[1]) & 0xff; + u32 a = swap32 (digest[0]); + u32 b = swap32 (digest[1]) & 0xff; const u32 r0 = a; const u32 r1 = b; @@ -350,20 +385,41 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -393,55 +449,69 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; - const u32x pw_len = pw_l_len + pw_r_len; + const u32 pw_len = pw_l_len + pw_r_len; const u32 pw_salt_len = (pw_len * 2) + salt_len; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -450,10 +520,10 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, append_0x80_2x4 (w0, w1, pw_len); - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); @@ -512,8 +582,8 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32x a = swap32 (digest[0]); - u32x b = swap32 (digest[1]) & 0xff; + u32 a = swap32 (digest[0]); + u32 b = swap32 (digest[1]) & 0xff; const u32 r0 = a; const u32 r1 = b; diff --git a/OpenCL/m09900_a1.cl b/OpenCL/m09900_a1.cl index 69659f7..32759c1 100644 --- a/OpenCL/m09900_a1.cl +++ b/OpenCL/m09900_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,81 +36,116 @@ __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -284,7 +319,13 @@ __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -312,20 +353,41 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -342,63 +404,77 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); @@ -575,7 +651,13 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m10100_a1.cl b/OpenCL/m10100_a1.cl index 157b643..7960896 100644 --- a/OpenCL/m10100_a1.cl +++ b/OpenCL/m10100_a1.cl @@ -5,8 +5,6 @@ #define _SIPHASH_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define SIPROUND(v0,v1,v2,v3) \ (v0) += (v1); \ @@ -52,20 +52,41 @@ __kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * base */ @@ -84,32 +105,46 @@ __kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; @@ -197,20 +232,41 @@ __kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -241,32 +297,46 @@ __kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; diff --git a/OpenCL/m10420_a1.cl b/OpenCL/m10420_a1.cl index 3a31ac0..17778ad 100644 --- a/OpenCL/m10420_a1.cl +++ b/OpenCL/m10420_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u32 padding[8] = { @@ -146,20 +146,43 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * U_buf */ @@ -188,63 +211,77 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; // max length supported by pdf11 is 32 @@ -316,8 +353,8 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, md5_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32x a = digest[0]; - u32x b = digest[1] & 0xff; + u32 a = digest[0]; + u32 b = digest[1] & 0xff; const u32 r0 = a; const u32 r1 = b; @@ -352,20 +389,43 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -406,63 +466,77 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; // max length supported by pdf11 is 32 @@ -534,8 +608,8 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, md5_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32x a = digest[0]; - u32x b = digest[1] & 0xff; + u32 a = digest[0]; + u32 b = digest[1] & 0xff; const u32 r0 = a; const u32 r1 = b; diff --git a/OpenCL/m10800_a1.cl b/OpenCL/m10800_a1.cl index 73e72ac..c547934 100644 --- a/OpenCL/m10800_a1.cl +++ b/OpenCL/m10800_a1.cl @@ -5,8 +5,6 @@ #define _SHA384_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u64 k_sha384[80] = { @@ -156,43 +156,80 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -201,10 +238,10 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -227,10 +264,10 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * SHA384 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); @@ -295,20 +332,43 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -325,25 +385,39 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -352,10 +426,10 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -378,10 +452,10 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * SHA384 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = swap32 (w0[0]); w0_t[1] = swap32 (w0[1]); diff --git a/OpenCL/m11000_a1.cl b/OpenCL/m11000_a1.cl index d97ea2a..0798737 100644 --- a/OpenCL/m11000_a1.cl +++ b/OpenCL/m11000_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,20 +36,41 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -86,35 +107,43 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -135,7 +164,7 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, append_0x80_4x4 (w0, w1, w2, w3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; /** * prepend salt @@ -143,10 +172,10 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, // first step fixed 56 bytes of salt - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; @@ -174,10 +203,10 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, // first transform - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -348,7 +377,13 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -376,20 +411,41 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -438,35 +494,43 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + u32 wordr1[4]; + u32 wordr2[4]; + u32 wordr3[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; @@ -487,7 +551,7 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, append_0x80_4x4 (w0, w1, w2, w3, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; /** * prepend salt @@ -495,10 +559,10 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, // first step fixed 56 bytes of salt - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; @@ -526,10 +590,10 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, // first transform - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -700,7 +764,13 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m11100_a1.cl b/OpenCL/m11100_a1.cl index 8f0a84b..baca662 100644 --- a/OpenCL/m11100_a1.cl +++ b/OpenCL/m11100_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -55,20 +55,41 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * challenge */ @@ -101,53 +122,67 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; w0_t[2] = salt_buf0[2]; w0_t[3] = salt_buf0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = salt_buf1[0]; w1_t[1] = salt_buf1[1]; w1_t[2] = salt_buf1[2]; w1_t[3] = salt_buf1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -160,7 +195,7 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; w0_t[0] |= wordl0[0] | wordr0[0]; w0_t[1] |= wordl0[1] | wordr0[1]; @@ -186,10 +221,10 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 ($pass.$salt) */ - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -369,7 +404,13 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -414,20 +455,41 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * challenge */ @@ -472,53 +534,67 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; w0_t[2] = salt_buf0[2]; w0_t[3] = salt_buf0[3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = salt_buf1[0]; w1_t[1] = salt_buf1[1]; w1_t[2] = salt_buf1[2]; w1_t[3] = salt_buf1[3]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; @@ -531,7 +607,7 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); - const u32x pw_salt_len = pw_len + salt_len; + const u32 pw_salt_len = pw_len + salt_len; w0_t[0] |= wordl0[0] | wordr0[0]; w0_t[1] |= wordl0[1] | wordr0[1]; @@ -557,10 +633,10 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 ($pass.$salt) */ - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -740,7 +816,13 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m11200_a1.cl b/OpenCL/m11200_a1.cl index 37437f5..830761f 100644 --- a/OpenCL/m11200_a1.cl +++ b/OpenCL/m11200_a1.cl @@ -5,8 +5,6 @@ #define _SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -36,20 +36,43 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -66,25 +89,39 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -93,28 +130,28 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -125,28 +162,28 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 ($pass) */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -514,7 +551,13 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, c ^= plain_sha1_c; d ^= plain_sha1_d; e ^= plain_sha1_e; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -542,20 +585,43 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -584,25 +650,39 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -611,28 +691,28 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -643,28 +723,28 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 ($pass) */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; #undef K #define K SHA1C00 @@ -1032,7 +1112,13 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, c ^= plain_sha1_c; d ^= plain_sha1_d; e ^= plain_sha1_e; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = d; + const u32 r1 = e; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m11400_a1.cl b/OpenCL/m11400_a1.cl index d48562f..e23223d 100644 --- a/OpenCL/m11400_a1.cl +++ b/OpenCL/m11400_a1.cl @@ -5,8 +5,6 @@ #define _MD5_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_lower8(i) l_bin2asc[(i)] @@ -776,20 +776,43 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -904,53 +927,67 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -1007,28 +1044,28 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = block0[12]; w3_t[1] = block0[13]; @@ -1042,10 +1079,10 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, // md5 - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -1547,7 +1584,13 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_M_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_M } } @@ -1592,20 +1635,43 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -1732,53 +1798,67 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -1835,28 +1915,28 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); - u32x w0_t[4]; + u32 w0_t[4]; w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32x w1_t[4]; + u32 w1_t[4]; w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32x w2_t[4]; + u32 w2_t[4]; w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32x w3_t[4]; + u32 w3_t[4]; w3_t[0] = block0[12]; w3_t[1] = block0[13]; @@ -1870,10 +1950,10 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, // md5 - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; + u32 a = MD5M_A; + u32 b = MD5M_B; + u32 c = MD5M_C; + u32 d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -2375,7 +2455,13 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, b += r_b; c += r_c; d += r_d; - COMPARE_S_SIMD (a, d, c, b); + + const u32 r0 = a; + const u32 r1 = d; + const u32 r2 = c; + const u32 r3 = b; + + #include COMPARE_S } } diff --git a/OpenCL/m11500_a1.cl b/OpenCL/m11500_a1.cl index 63a90dc..c537c0e 100644 --- a/OpenCL/m11500_a1.cl +++ b/OpenCL/m11500_a1.cl @@ -5,8 +5,6 @@ #define _CRC32_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" __constant u32 crc32tab[0x100] = { @@ -143,50 +143,85 @@ __kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w_t[16]; + u32 w_t[16]; w_t[ 0] = wordl0[0] | wordr0[0]; w_t[ 1] = wordl0[1] | wordr0[1]; @@ -205,8 +240,8 @@ __kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w_t[14] = wordl3[2] | wordr3[2]; w_t[15] = 0; - u32x a = crc32 (w_t, pw_len, iv); - u32x b = 0; + u32 a = crc32 (w_t, pw_len, iv); + u32 b = 0; const u32 r0 = a; const u32 r1 = b; @@ -241,20 +276,41 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * digest */ @@ -273,32 +329,46 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w_t[16]; + u32 w_t[16]; w_t[ 0] = wordl0[0] | wordr0[0]; w_t[ 1] = wordl0[1] | wordr0[1]; @@ -317,8 +387,8 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w_t[14] = wordl3[2] | wordr3[2]; w_t[15] = 0; - u32x a = crc32 (w_t, pw_len, iv); - u32x b = 0; + u32 a = crc32 (w_t, pw_len, iv); + u32 b = 0; const u32 r0 = a; const u32 r1 = b; diff --git a/OpenCL/m11700_a1.cl b/OpenCL/m11700_a1.cl index e0d4d2c..361cba3 100644 --- a/OpenCL/m11700_a1.cl +++ b/OpenCL/m11700_a1.cl @@ -7,8 +7,6 @@ #define _GOST2012_256_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define INITVAL 0x0101010101010101 @@ -2320,50 +2320,87 @@ __kernel void m11700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; @@ -2502,50 +2539,87 @@ __kernel void m11700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; diff --git a/OpenCL/m11800_a1.cl b/OpenCL/m11800_a1.cl index be8e7be..2932a24 100644 --- a/OpenCL/m11800_a1.cl +++ b/OpenCL/m11800_a1.cl @@ -7,8 +7,6 @@ #define _GOST2012_512_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define INITVAL 0 @@ -2320,50 +2320,87 @@ __kernel void m11800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr2[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; @@ -2502,50 +2539,87 @@ __kernel void m11800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; + + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; + + u32 wordr2[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr3[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w[16]; + u32 w[16]; w[ 0] = wordl0[0] | wordr0[0]; w[ 1] = wordl0[1] | wordr0[1]; diff --git a/OpenCL/m12600_a1.cl b/OpenCL/m12600_a1.cl index 09bc487..80485c6 100644 --- a/OpenCL/m12600_a1.cl +++ b/OpenCL/m12600_a1.cl @@ -5,8 +5,6 @@ #define _SHA256_SHA1_ -#define NEW_SIMD_CODE - #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,7 +16,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" -#include "OpenCL/simd.c" + +#define COMPARE_S "OpenCL/check_single_comp4.c" +#define COMPARE_M "OpenCL/check_multi_comp4.c" #define uint_to_hex_upper8(i) l_bin2asc[(i)] @@ -55,20 +55,43 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + u32 wordl2[4]; + + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -88,25 +111,39 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; - const u32x pw_len = pw_l_len + pw_r_len; + u32 wordr1[4]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + u32 wordr2[4]; + + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -115,28 +152,28 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -147,31 +184,31 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; - u32x f = 0; - u32x g = 0; - u32x h = 0; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; + u32 f = 0; + u32 g = 0; + u32 h = 0; #undef K #define K SHA1C00 @@ -394,7 +431,13 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_M_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_M } } @@ -439,20 +482,43 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - u32 pws0[4] = { 0 }; - u32 pws1[4] = { 0 }; + u32 wordl0[4]; + + wordl0[0] = pws[gid].i[ 0]; + wordl0[1] = pws[gid].i[ 1]; + wordl0[2] = pws[gid].i[ 2]; + wordl0[3] = pws[gid].i[ 3]; + + u32 wordl1[4]; + + wordl1[0] = pws[gid].i[ 4]; + wordl1[1] = pws[gid].i[ 5]; + wordl1[2] = pws[gid].i[ 6]; + wordl1[3] = pws[gid].i[ 7]; + + u32 wordl2[4]; - pws0[0] = pws[gid].i[0]; - pws0[1] = pws[gid].i[1]; - pws0[2] = pws[gid].i[2]; - pws0[3] = pws[gid].i[3]; - pws1[0] = pws[gid].i[4]; - pws1[1] = pws[gid].i[5]; - pws1[2] = pws[gid].i[6]; - pws1[3] = pws[gid].i[7]; + wordl2[0] = 0; + wordl2[1] = 0; + wordl2[2] = 0; + wordl2[3] = 0; + + u32 wordl3[4]; + + wordl3[0] = 0; + wordl3[1] = 0; + wordl3[2] = 0; + wordl3[3] = 0; const u32 pw_l_len = pws[gid].pw_len; + if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) + { + append_0x80_2x4 (wordl0, wordl1, pw_l_len); + + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + } + /** * salt */ @@ -484,25 +550,39 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE) + for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + const u32 pw_r_len = combs_buf[il_pos].pw_len; + + const u32 pw_len = pw_l_len + pw_r_len; + + u32 wordr0[4]; + + wordr0[0] = combs_buf[il_pos].i[0]; + wordr0[1] = combs_buf[il_pos].i[1]; + wordr0[2] = combs_buf[il_pos].i[2]; + wordr0[3] = combs_buf[il_pos].i[3]; + + u32 wordr1[4]; - const u32x pw_len = pw_l_len + pw_r_len; + wordr1[0] = combs_buf[il_pos].i[4]; + wordr1[1] = combs_buf[il_pos].i[5]; + wordr1[2] = combs_buf[il_pos].i[6]; + wordr1[3] = combs_buf[il_pos].i[7]; - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; + u32 wordr2[4]; - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + wordr2[0] = 0; + wordr2[1] = 0; + wordr2[2] = 0; + wordr2[3] = 0; + + u32 wordr3[4]; + + wordr3[0] = 0; + wordr3[1] = 0; + wordr3[2] = 0; + wordr3[3] = 0; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { @@ -511,28 +591,28 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } - u32x w0[4]; + u32 w0[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - u32x w1[4]; + u32 w1[4]; w1[0] = wordl1[0] | wordr1[0]; w1[1] = wordl1[1] | wordr1[1]; w1[2] = wordl1[2] | wordr1[2]; w1[3] = wordl1[3] | wordr1[3]; - u32x w2[4]; + u32 w2[4]; w2[0] = wordl2[0] | wordr2[0]; w2[1] = wordl2[1] | wordr2[1]; w2[2] = wordl2[2] | wordr2[2]; w2[3] = wordl2[3] | wordr2[3]; - u32x w3[4]; + u32 w3[4]; w3[0] = wordl3[0] | wordr3[0]; w3[1] = wordl3[1] | wordr3[1]; @@ -543,31 +623,31 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * sha1 */ - u32x w0_t = swap32 (w0[0]); - u32x w1_t = swap32 (w0[1]); - u32x w2_t = swap32 (w0[2]); - u32x w3_t = swap32 (w0[3]); - u32x w4_t = swap32 (w1[0]); - u32x w5_t = swap32 (w1[1]); - u32x w6_t = swap32 (w1[2]); - u32x w7_t = swap32 (w1[3]); - u32x w8_t = swap32 (w2[0]); - u32x w9_t = swap32 (w2[1]); - u32x wa_t = swap32 (w2[2]); - u32x wb_t = swap32 (w2[3]); - u32x wc_t = swap32 (w3[0]); - u32x wd_t = swap32 (w3[1]); - u32x we_t = 0; - u32x wf_t = pw_len * 8; - - u32x a = SHA1M_A; - u32x b = SHA1M_B; - u32x c = SHA1M_C; - u32x d = SHA1M_D; - u32x e = SHA1M_E; - u32x f = 0; - u32x g = 0; - u32x h = 0; + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = 0; + u32 wf_t = pw_len * 8; + + u32 a = SHA1M_A; + u32 b = SHA1M_B; + u32 c = SHA1M_C; + u32 d = SHA1M_D; + u32 e = SHA1M_E; + u32 f = 0; + u32 g = 0; + u32 h = 0; #undef K #define K SHA1C00 @@ -790,7 +870,13 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - COMPARE_S_SIMD (d, h, c, g); + + const u32 r0 = d; + const u32 r1 = h; + const u32 r2 = c; + const u32 r3 = g; + + #include COMPARE_S } } diff --git a/src/oclHashcat.c b/src/oclHashcat.c index bdfb65d..aa171b5 100644 --- a/src/oclHashcat.c +++ b/src/oclHashcat.c @@ -4494,8 +4494,6 @@ static void *thread_calc_stdin (void *p) run_cracker (device_param, pws_cnt); device_param->pws_cnt = 0; - - memset (device_param->pws_buf, 0, device_param->size_pws); } } @@ -4755,8 +4753,6 @@ static void *thread_calc (void *p) run_cracker (device_param, pws_cnt); device_param->pws_cnt = 0; - - memset (device_param->pws_buf, 0, device_param->size_pws); } if (data.devices_status == STATUS_STOP_AT_CHECKPOINT) check_checkpoint (); -- 2.25.1