From 9d74f2958d77d354fc4bfefa851f8fb3a1418720 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sat, 14 May 2016 19:45:51 +0200 Subject: [PATCH] Added SIMD code for WPA/WPA2 --- OpenCL/common.c | 530 +++++++++++++++++++++ OpenCL/m00400.cl | 123 +++++ OpenCL/m02500.cl | 954 +++++++++++++++++++++++++++++-------- include/kernel_functions.c | 9 + include/kernel_vendor.h | 19 +- include/shared.h | 9 +- src/hashcat.c | 28 +- 7 files changed, 1474 insertions(+), 198 deletions(-) diff --git a/OpenCL/common.c b/OpenCL/common.c index 0846999..0d99f61 100644 --- a/OpenCL/common.c +++ b/OpenCL/common.c @@ -6186,6 +6186,536 @@ inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) } } +inline void append_0x01_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = 0x01; + break; + + case 1: + w0[0] = w0[0] | 0x0100; + break; + + case 2: + w0[0] = w0[0] | 0x010000; + break; + + case 3: + w0[0] = w0[0] | 0x01000000; + break; + + case 4: + w0[1] = 0x01; + break; + + case 5: + w0[1] = w0[1] | 0x0100; + break; + + case 6: + w0[1] = w0[1] | 0x010000; + break; + + case 7: + w0[1] = w0[1] | 0x01000000; + break; + + case 8: + w0[2] = 0x01; + break; + + case 9: + w0[2] = w0[2] | 0x0100; + break; + + case 10: + w0[2] = w0[2] | 0x010000; + break; + + case 11: + w0[2] = w0[2] | 0x01000000; + break; + + case 12: + w0[3] = 0x01; + break; + + case 13: + w0[3] = w0[3] | 0x0100; + break; + + case 14: + w0[3] = w0[3] | 0x010000; + break; + + case 15: + w0[3] = w0[3] | 0x01000000; + break; + + case 16: + w1[0] = 0x01; + break; + + case 17: + w1[0] = w1[0] | 0x0100; + break; + + case 18: + w1[0] = w1[0] | 0x010000; + break; + + case 19: + w1[0] = w1[0] | 0x01000000; + break; + + case 20: + w1[1] = 0x01; + break; + + case 21: + w1[1] = w1[1] | 0x0100; + break; + + case 22: + w1[1] = w1[1] | 0x010000; + break; + + case 23: + w1[1] = w1[1] | 0x01000000; + break; + + case 24: + w1[2] = 0x01; + break; + + case 25: + w1[2] = w1[2] | 0x0100; + break; + + case 26: + w1[2] = w1[2] | 0x010000; + break; + + case 27: + w1[2] = w1[2] | 0x01000000; + break; + + case 28: + w1[3] = 0x01; + break; + + case 29: + w1[3] = w1[3] | 0x0100; + break; + + case 30: + w1[3] = w1[3] | 0x010000; + break; + + case 31: + w1[3] = w1[3] | 0x01000000; + break; + + case 32: + w2[0] = 0x01; + break; + + case 33: + w2[0] = w2[0] | 0x0100; + break; + + case 34: + w2[0] = w2[0] | 0x010000; + break; + + case 35: + w2[0] = w2[0] | 0x01000000; + break; + + case 36: + w2[1] = 0x01; + break; + + case 37: + w2[1] = w2[1] | 0x0100; + break; + + case 38: + w2[1] = w2[1] | 0x010000; + break; + + case 39: + w2[1] = w2[1] | 0x01000000; + break; + + case 40: + w2[2] = 0x01; + break; + + case 41: + w2[2] = w2[2] | 0x0100; + break; + + case 42: + w2[2] = w2[2] | 0x010000; + break; + + case 43: + w2[2] = w2[2] | 0x01000000; + break; + + case 44: + w2[3] = 0x01; + break; + + case 45: + w2[3] = w2[3] | 0x0100; + break; + + case 46: + w2[3] = w2[3] | 0x010000; + break; + + case 47: + w2[3] = w2[3] | 0x01000000; + break; + } +} + +inline void append_0x02_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = 0x02; + break; + + case 1: + w0[0] = w0[0] | 0x0200; + break; + + case 2: + w0[0] = w0[0] | 0x020000; + break; + + case 3: + w0[0] = w0[0] | 0x02000000; + break; + + case 4: + w0[1] = 0x02; + break; + + case 5: + w0[1] = w0[1] | 0x0200; + break; + + case 6: + w0[1] = w0[1] | 0x020000; + break; + + case 7: + w0[1] = w0[1] | 0x02000000; + break; + + case 8: + w0[2] = 0x02; + break; + + case 9: + w0[2] = w0[2] | 0x0200; + break; + + case 10: + w0[2] = w0[2] | 0x020000; + break; + + case 11: + w0[2] = w0[2] | 0x02000000; + break; + + case 12: + w0[3] = 0x02; + break; + + case 13: + w0[3] = w0[3] | 0x0200; + break; + + case 14: + w0[3] = w0[3] | 0x020000; + break; + + case 15: + w0[3] = w0[3] | 0x02000000; + break; + + case 16: + w1[0] = 0x02; + break; + + case 17: + w1[0] = w1[0] | 0x0200; + break; + + case 18: + w1[0] = w1[0] | 0x020000; + break; + + case 19: + w1[0] = w1[0] | 0x02000000; + break; + + case 20: + w1[1] = 0x02; + break; + + case 21: + w1[1] = w1[1] | 0x0200; + break; + + case 22: + w1[1] = w1[1] | 0x020000; + break; + + case 23: + w1[1] = w1[1] | 0x02000000; + break; + + case 24: + w1[2] = 0x02; + break; + + case 25: + w1[2] = w1[2] | 0x0200; + break; + + case 26: + w1[2] = w1[2] | 0x020000; + break; + + case 27: + w1[2] = w1[2] | 0x02000000; + break; + + case 28: + w1[3] = 0x02; + break; + + case 29: + w1[3] = w1[3] | 0x0200; + break; + + case 30: + w1[3] = w1[3] | 0x020000; + break; + + case 31: + w1[3] = w1[3] | 0x02000000; + break; + } +} + +inline void append_0x02_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = 0x02; + break; + + case 1: + w0[0] = w0[0] | 0x0200; + break; + + case 2: + w0[0] = w0[0] | 0x020000; + break; + + case 3: + w0[0] = w0[0] | 0x02000000; + break; + + case 4: + w0[1] = 0x02; + break; + + case 5: + w0[1] = w0[1] | 0x0200; + break; + + case 6: + w0[1] = w0[1] | 0x020000; + break; + + case 7: + w0[1] = w0[1] | 0x02000000; + break; + + case 8: + w0[2] = 0x02; + break; + + case 9: + w0[2] = w0[2] | 0x0200; + break; + + case 10: + w0[2] = w0[2] | 0x020000; + break; + + case 11: + w0[2] = w0[2] | 0x02000000; + break; + + case 12: + w0[3] = 0x02; + break; + + case 13: + w0[3] = w0[3] | 0x0200; + break; + + case 14: + w0[3] = w0[3] | 0x020000; + break; + + case 15: + w0[3] = w0[3] | 0x02000000; + break; + + case 16: + w1[0] = 0x02; + break; + + case 17: + w1[0] = w1[0] | 0x0200; + break; + + case 18: + w1[0] = w1[0] | 0x020000; + break; + + case 19: + w1[0] = w1[0] | 0x02000000; + break; + + case 20: + w1[1] = 0x02; + break; + + case 21: + w1[1] = w1[1] | 0x0200; + break; + + case 22: + w1[1] = w1[1] | 0x020000; + break; + + case 23: + w1[1] = w1[1] | 0x02000000; + break; + + case 24: + w1[2] = 0x02; + break; + + case 25: + w1[2] = w1[2] | 0x0200; + break; + + case 26: + w1[2] = w1[2] | 0x020000; + break; + + case 27: + w1[2] = w1[2] | 0x02000000; + break; + + case 28: + w1[3] = 0x02; + break; + + case 29: + w1[3] = w1[3] | 0x0200; + break; + + case 30: + w1[3] = w1[3] | 0x020000; + break; + + case 31: + w1[3] = w1[3] | 0x02000000; + break; + + case 32: + w2[0] = 0x02; + break; + + case 33: + w2[0] = w2[0] | 0x0200; + break; + + case 34: + w2[0] = w2[0] | 0x020000; + break; + + case 35: + w2[0] = w2[0] | 0x02000000; + break; + + case 36: + w2[1] = 0x02; + break; + + case 37: + w2[1] = w2[1] | 0x0200; + break; + + case 38: + w2[1] = w2[1] | 0x020000; + break; + + case 39: + w2[1] = w2[1] | 0x02000000; + break; + + case 40: + w2[2] = 0x02; + break; + + case 41: + w2[2] = w2[2] | 0x0200; + break; + + case 42: + w2[2] = w2[2] | 0x020000; + break; + + case 43: + w2[2] = w2[2] | 0x02000000; + break; + + case 44: + w2[3] = 0x02; + break; + + case 45: + w2[3] = w2[3] | 0x0200; + break; + + case 46: + w2[3] = w2[3] | 0x020000; + break; + + case 47: + w2[3] = w2[3] | 0x02000000; + break; + } +} + inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) { switch (offset) diff --git a/OpenCL/m00400.cl b/OpenCL/m00400.cl index 6bd37cf..1610b6a 100644 --- a/OpenCL/m00400.cl +++ b/OpenCL/m00400.cl @@ -448,6 +448,93 @@ __kernel void m00400_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf digest[3].s3 = tmps[gidx + 3].digest_buf[3]; } + #endif + #if VECT_SIZE >= 8 + + if ((gidx + 4) < gid_max) + { + w0[0].s4 = pws[gidx + 4].i[0]; + w0[1].s4 = pws[gidx + 4].i[1]; + w0[2].s4 = pws[gidx + 4].i[2]; + w0[3].s4 = pws[gidx + 4].i[3]; + w1[0].s4 = pws[gidx + 4].i[4]; + w1[1].s4 = pws[gidx + 4].i[5]; + w1[2].s4 = pws[gidx + 4].i[6]; + w1[3].s4 = pws[gidx + 4].i[7]; + w2[0].s4 = pws[gidx + 4].i[8]; + w2[1].s4 = pws[gidx + 4].i[9]; + + pw_len.s4 = pws[gidx + 4].pw_len; + + digest[0].s4 = tmps[gidx + 4].digest_buf[0]; + digest[1].s4 = tmps[gidx + 4].digest_buf[1]; + digest[2].s4 = tmps[gidx + 4].digest_buf[2]; + digest[3].s4 = tmps[gidx + 4].digest_buf[3]; + } + + if ((gidx + 5) < gid_max) + { + w0[0].s5 = pws[gidx + 5].i[0]; + w0[1].s5 = pws[gidx + 5].i[1]; + w0[2].s5 = pws[gidx + 5].i[2]; + w0[3].s5 = pws[gidx + 5].i[3]; + w1[0].s5 = pws[gidx + 5].i[4]; + w1[1].s5 = pws[gidx + 5].i[5]; + w1[2].s5 = pws[gidx + 5].i[6]; + w1[3].s5 = pws[gidx + 5].i[7]; + w2[0].s5 = pws[gidx + 5].i[8]; + w2[1].s5 = pws[gidx + 5].i[9]; + + pw_len.s5 = pws[gidx + 5].pw_len; + + digest[0].s5 = tmps[gidx + 5].digest_buf[0]; + digest[1].s5 = tmps[gidx + 5].digest_buf[1]; + digest[2].s5 = tmps[gidx + 5].digest_buf[2]; + digest[3].s5 = tmps[gidx + 5].digest_buf[3]; + } + + if ((gidx + 6) < gid_max) + { + w0[0].s6 = pws[gidx + 6].i[0]; + w0[1].s6 = pws[gidx + 6].i[1]; + w0[2].s6 = pws[gidx + 6].i[2]; + w0[3].s6 = pws[gidx + 6].i[3]; + w1[0].s6 = pws[gidx + 6].i[4]; + w1[1].s6 = pws[gidx + 6].i[5]; + w1[2].s6 = pws[gidx + 6].i[6]; + w1[3].s6 = pws[gidx + 6].i[7]; + w2[0].s6 = pws[gidx + 6].i[8]; + w2[1].s6 = pws[gidx + 6].i[9]; + + pw_len.s6 = pws[gidx + 6].pw_len; + + digest[0].s6 = tmps[gidx + 6].digest_buf[0]; + digest[1].s6 = tmps[gidx + 6].digest_buf[1]; + digest[2].s6 = tmps[gidx + 6].digest_buf[2]; + digest[3].s6 = tmps[gidx + 6].digest_buf[3]; + } + + if ((gidx + 7) < gid_max) + { + w0[0].s7 = pws[gidx + 7].i[0]; + w0[1].s7 = pws[gidx + 7].i[1]; + w0[2].s7 = pws[gidx + 7].i[2]; + w0[3].s7 = pws[gidx + 7].i[3]; + w1[0].s7 = pws[gidx + 7].i[4]; + w1[1].s7 = pws[gidx + 7].i[5]; + w1[2].s7 = pws[gidx + 7].i[6]; + w1[3].s7 = pws[gidx + 7].i[7]; + w2[0].s7 = pws[gidx + 7].i[8]; + w2[1].s7 = pws[gidx + 7].i[9]; + + pw_len.s7 = pws[gidx + 7].pw_len; + + digest[0].s7 = tmps[gidx + 7].digest_buf[0]; + digest[1].s7 = tmps[gidx + 7].digest_buf[1]; + digest[2].s7 = tmps[gidx + 7].digest_buf[2]; + digest[3].s7 = tmps[gidx + 7].digest_buf[3]; + } + #endif #endif @@ -547,6 +634,42 @@ __kernel void m00400_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf tmps[gidx + 3].digest_buf[3] = digest[3].s3; } + #endif + + #if VECT_SIZE >= 8 + + if ((gidx + 4) < gid_max) + { + tmps[gidx + 4].digest_buf[0] = digest[0].s4; + tmps[gidx + 4].digest_buf[1] = digest[1].s4; + tmps[gidx + 4].digest_buf[2] = digest[2].s4; + tmps[gidx + 4].digest_buf[3] = digest[3].s4; + } + + if ((gidx + 5) < gid_max) + { + tmps[gidx + 5].digest_buf[0] = digest[0].s5; + tmps[gidx + 5].digest_buf[1] = digest[1].s5; + tmps[gidx + 5].digest_buf[2] = digest[2].s5; + tmps[gidx + 5].digest_buf[3] = digest[3].s5; + } + + if ((gidx + 6) < gid_max) + { + tmps[gidx + 6].digest_buf[0] = digest[0].s6; + tmps[gidx + 6].digest_buf[1] = digest[1].s6; + tmps[gidx + 6].digest_buf[2] = digest[2].s6; + tmps[gidx + 6].digest_buf[3] = digest[3].s6; + } + + if ((gidx + 7) < gid_max) + { + tmps[gidx + 7].digest_buf[0] = digest[0].s7; + tmps[gidx + 7].digest_buf[1] = digest[1].s7; + tmps[gidx + 7].digest_buf[2] = digest[2].s7; + tmps[gidx + 7].digest_buf[3] = digest[3].s7; + } + #endif #endif } diff --git a/OpenCL/m02500.cl b/OpenCL/m02500.cl index 323554d..7b62c69 100644 --- a/OpenCL/m02500.cl +++ b/OpenCL/m02500.cl @@ -5,6 +5,8 @@ #define _WPA_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -20,7 +22,7 @@ #define COMPARE_S "OpenCL/check_single_comp4.c" #define COMPARE_M "OpenCL/check_multi_comp4.c" -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +void md5_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) { u32 a = digest[0]; u32 b = digest[1]; @@ -44,73 +46,73 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 u32 we_t = w3[2]; u32 wf_t = w3[3]; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP_S (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); + MD5_STEP_S (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); + MD5_STEP_S (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); + MD5_STEP_S (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); + MD5_STEP_S (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); + MD5_STEP_S (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); + MD5_STEP_S (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); + MD5_STEP_S (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); + MD5_STEP_S (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); + MD5_STEP_S (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); + MD5_STEP_S (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); + MD5_STEP_S (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); + MD5_STEP_S (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); + MD5_STEP_S (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); + MD5_STEP_S (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); + MD5_STEP_S (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + + MD5_STEP_S (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); + MD5_STEP_S (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); + MD5_STEP_S (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); + MD5_STEP_S (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); + MD5_STEP_S (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); + MD5_STEP_S (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); + MD5_STEP_S (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); + MD5_STEP_S (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); + MD5_STEP_S (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); + MD5_STEP_S (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); + MD5_STEP_S (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); + MD5_STEP_S (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); + MD5_STEP_S (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); + MD5_STEP_S (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); + MD5_STEP_S (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); + MD5_STEP_S (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + + MD5_STEP_S (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); + MD5_STEP_S (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); + MD5_STEP_S (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); + MD5_STEP_S (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); + MD5_STEP_S (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); + MD5_STEP_S (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); + MD5_STEP_S (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); + MD5_STEP_S (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); + MD5_STEP_S (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); + MD5_STEP_S (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); + MD5_STEP_S (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); + MD5_STEP_S (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); + MD5_STEP_S (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); + MD5_STEP_S (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); + MD5_STEP_S (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); + MD5_STEP_S (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + + MD5_STEP_S (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); + MD5_STEP_S (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); + MD5_STEP_S (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); + MD5_STEP_S (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); + MD5_STEP_S (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); + MD5_STEP_S (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); + MD5_STEP_S (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); + MD5_STEP_S (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); + MD5_STEP_S (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); + MD5_STEP_S (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); + MD5_STEP_S (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); + MD5_STEP_S (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); + MD5_STEP_S (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); + MD5_STEP_S (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); + MD5_STEP_S (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); + MD5_STEP_S (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); digest[0] += a; digest[1] += b; @@ -118,7 +120,7 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 digest[3] += d; } -void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4]) +void hmac_md5_pad_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -142,7 +144,7 @@ void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 ipad[2] = MD5M_C; ipad[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, ipad); + md5_transform_S (w0, w1, w2, w3, ipad); w0[0] = w0[0] ^ 0x6a6a6a6a; w0[1] = w0[1] ^ 0x6a6a6a6a; @@ -166,17 +168,17 @@ void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[2] = MD5M_C; opad[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, opad); + md5_transform_S (w0, w1, w2, w3, opad); } -void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4]) +void hmac_md5_run_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4]) { digest[0] = ipad[0]; digest[1] = ipad[1]; digest[2] = ipad[2]; digest[3] = ipad[3]; - md5_transform (w0, w1, w2, w3, digest); + md5_transform_S (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -200,10 +202,10 @@ void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 digest[2] = opad[2]; digest[3] = opad[3]; - md5_transform (w0, w1, w2, w3, digest); + md5_transform_S (w0, w1, w2, w3, digest); } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +void sha1_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { u32 A = digest[0]; u32 B = digest[1]; @@ -231,6 +233,223 @@ void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u3 #undef K #define K SHA1C00 + SHA1_STEP_S (SHA1_F0o, A, B, C, D, E, w0_t); + SHA1_STEP_S (SHA1_F0o, E, A, B, C, D, w1_t); + SHA1_STEP_S (SHA1_F0o, D, E, A, B, C, w2_t); + SHA1_STEP_S (SHA1_F0o, C, D, E, A, B, w3_t); + SHA1_STEP_S (SHA1_F0o, B, C, D, E, A, w4_t); + SHA1_STEP_S (SHA1_F0o, A, B, C, D, E, w5_t); + SHA1_STEP_S (SHA1_F0o, E, A, B, C, D, w6_t); + SHA1_STEP_S (SHA1_F0o, D, E, A, B, C, w7_t); + SHA1_STEP_S (SHA1_F0o, C, D, E, A, B, w8_t); + SHA1_STEP_S (SHA1_F0o, B, C, D, E, A, w9_t); + SHA1_STEP_S (SHA1_F0o, A, B, C, D, E, wa_t); + SHA1_STEP_S (SHA1_F0o, E, A, B, C, D, wb_t); + SHA1_STEP_S (SHA1_F0o, D, E, A, B, C, wc_t); + SHA1_STEP_S (SHA1_F0o, C, D, E, A, B, wd_t); + SHA1_STEP_S (SHA1_F0o, B, C, D, E, A, we_t); + SHA1_STEP_S (SHA1_F0o, A, B, C, D, E, wf_t); + w0_t = rotl32_S ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP_S (SHA1_F0o, E, A, B, C, D, w0_t); + w1_t = rotl32_S ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP_S (SHA1_F0o, D, E, A, B, C, w1_t); + w2_t = rotl32_S ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP_S (SHA1_F0o, C, D, E, A, B, w2_t); + w3_t = rotl32_S ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP_S (SHA1_F0o, B, C, D, E, A, w3_t); + + #undef K + #define K SHA1C01 + + w4_t = rotl32_S ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w4_t); + w5_t = rotl32_S ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, w5_t); + w6_t = rotl32_S ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w6_t); + w7_t = rotl32_S ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w7_t); + w8_t = rotl32_S ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w8_t); + w9_t = rotl32_S ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w9_t); + wa_t = rotl32_S ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, wa_t); + wb_t = rotl32_S ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, wb_t); + wc_t = rotl32_S ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, wc_t); + wd_t = rotl32_S ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, wd_t); + we_t = rotl32_S ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, we_t); + wf_t = rotl32_S ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, wf_t); + w0_t = rotl32_S ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w0_t); + w1_t = rotl32_S ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w1_t); + w2_t = rotl32_S ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w2_t); + w3_t = rotl32_S ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w3_t); + w4_t = rotl32_S ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, w4_t); + w5_t = rotl32_S ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w5_t); + w6_t = rotl32_S ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w6_t); + w7_t = rotl32_S ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w7_t); + + #undef K + #define K SHA1C02 + + w8_t = rotl32_S ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP_S (SHA1_F2o, A, B, C, D, E, w8_t); + w9_t = rotl32_S ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP_S (SHA1_F2o, E, A, B, C, D, w9_t); + wa_t = rotl32_S ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP_S (SHA1_F2o, D, E, A, B, C, wa_t); + wb_t = rotl32_S ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP_S (SHA1_F2o, C, D, E, A, B, wb_t); + wc_t = rotl32_S ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP_S (SHA1_F2o, B, C, D, E, A, wc_t); + wd_t = rotl32_S ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP_S (SHA1_F2o, A, B, C, D, E, wd_t); + we_t = rotl32_S ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP_S (SHA1_F2o, E, A, B, C, D, we_t); + wf_t = rotl32_S ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP_S (SHA1_F2o, D, E, A, B, C, wf_t); + w0_t = rotl32_S ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP_S (SHA1_F2o, C, D, E, A, B, w0_t); + w1_t = rotl32_S ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP_S (SHA1_F2o, B, C, D, E, A, w1_t); + w2_t = rotl32_S ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP_S (SHA1_F2o, A, B, C, D, E, w2_t); + w3_t = rotl32_S ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP_S (SHA1_F2o, E, A, B, C, D, w3_t); + w4_t = rotl32_S ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP_S (SHA1_F2o, D, E, A, B, C, w4_t); + w5_t = rotl32_S ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP_S (SHA1_F2o, C, D, E, A, B, w5_t); + w6_t = rotl32_S ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP_S (SHA1_F2o, B, C, D, E, A, w6_t); + w7_t = rotl32_S ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP_S (SHA1_F2o, A, B, C, D, E, w7_t); + w8_t = rotl32_S ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP_S (SHA1_F2o, E, A, B, C, D, w8_t); + w9_t = rotl32_S ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP_S (SHA1_F2o, D, E, A, B, C, w9_t); + wa_t = rotl32_S ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP_S (SHA1_F2o, C, D, E, A, B, wa_t); + wb_t = rotl32_S ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP_S (SHA1_F2o, B, C, D, E, A, wb_t); + + #undef K + #define K SHA1C03 + + wc_t = rotl32_S ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, wc_t); + wd_t = rotl32_S ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, wd_t); + we_t = rotl32_S ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, we_t); + wf_t = rotl32_S ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, wf_t); + w0_t = rotl32_S ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w0_t); + w1_t = rotl32_S ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w1_t); + w2_t = rotl32_S ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, w2_t); + w3_t = rotl32_S ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w3_t); + w4_t = rotl32_S ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w4_t); + w5_t = rotl32_S ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w5_t); + w6_t = rotl32_S ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w6_t); + w7_t = rotl32_S ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, w7_t); + w8_t = rotl32_S ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w8_t); + w9_t = rotl32_S ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w9_t); + wa_t = rotl32_S ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, wa_t); + wb_t = rotl32_S ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, wb_t); + wc_t = rotl32_S ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, wc_t); + wd_t = rotl32_S ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, wd_t); + we_t = rotl32_S ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, we_t); + wf_t = rotl32_S ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, wf_t); + + digest[0] += A; + digest[1] += B; + digest[2] += C; + digest[3] += D; + digest[4] += E; +} + +void hmac_sha1_pad_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5]) +{ + w0[0] = w0[0] ^ 0x36363636; + w0[1] = w0[1] ^ 0x36363636; + w0[2] = w0[2] ^ 0x36363636; + w0[3] = w0[3] ^ 0x36363636; + w1[0] = w1[0] ^ 0x36363636; + w1[1] = w1[1] ^ 0x36363636; + w1[2] = w1[2] ^ 0x36363636; + w1[3] = w1[3] ^ 0x36363636; + w2[0] = w2[0] ^ 0x36363636; + w2[1] = w2[1] ^ 0x36363636; + w2[2] = w2[2] ^ 0x36363636; + w2[3] = w2[3] ^ 0x36363636; + w3[0] = w3[0] ^ 0x36363636; + w3[1] = w3[1] ^ 0x36363636; + w3[2] = w3[2] ^ 0x36363636; + w3[3] = w3[3] ^ 0x36363636; + + ipad[0] = SHA1M_A; + ipad[1] = SHA1M_B; + ipad[2] = SHA1M_C; + ipad[3] = SHA1M_D; + ipad[4] = SHA1M_E; + + sha1_transform_S (w0, w1, w2, w3, ipad); + + w0[0] = w0[0] ^ 0x6a6a6a6a; + w0[1] = w0[1] ^ 0x6a6a6a6a; + w0[2] = w0[2] ^ 0x6a6a6a6a; + w0[3] = w0[3] ^ 0x6a6a6a6a; + w1[0] = w1[0] ^ 0x6a6a6a6a; + w1[1] = w1[1] ^ 0x6a6a6a6a; + w1[2] = w1[2] ^ 0x6a6a6a6a; + w1[3] = w1[3] ^ 0x6a6a6a6a; + w2[0] = w2[0] ^ 0x6a6a6a6a; + w2[1] = w2[1] ^ 0x6a6a6a6a; + w2[2] = w2[2] ^ 0x6a6a6a6a; + w2[3] = w2[3] ^ 0x6a6a6a6a; + w3[0] = w3[0] ^ 0x6a6a6a6a; + w3[1] = w3[1] ^ 0x6a6a6a6a; + w3[2] = w3[2] ^ 0x6a6a6a6a; + w3[3] = w3[3] ^ 0x6a6a6a6a; + + opad[0] = SHA1M_A; + opad[1] = SHA1M_B; + opad[2] = SHA1M_C; + opad[3] = SHA1M_D; + opad[4] = SHA1M_E; + + sha1_transform_S (w0, w1, w2, w3, opad); +} + +void hmac_sha1_run_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5]) +{ + digest[0] = ipad[0]; + digest[1] = ipad[1]; + digest[2] = ipad[2]; + digest[3] = ipad[3]; + digest[4] = ipad[4]; + + sha1_transform_S (w0, w1, w2, w3, digest); + + w0[0] = digest[0]; + w0[1] = digest[1]; + w0[2] = digest[2]; + w0[3] = digest[3]; + w1[0] = digest[4]; + w1[1] = 0x80000000; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = (64 + 20) * 8; + + digest[0] = opad[0]; + digest[1] = opad[1]; + digest[2] = opad[2]; + digest[3] = opad[3]; + digest[4] = opad[4]; + + sha1_transform_S (w0, w1, w2, w3, digest); +} + +void sha1_transform_V (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) +{ + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; + + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; + + #undef K + #define K SHA1C00 + SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); @@ -331,60 +550,7 @@ void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u3 digest[4] += E; } -void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5]) -{ - w0[0] = w0[0] ^ 0x36363636; - w0[1] = w0[1] ^ 0x36363636; - w0[2] = w0[2] ^ 0x36363636; - w0[3] = w0[3] ^ 0x36363636; - w1[0] = w1[0] ^ 0x36363636; - w1[1] = w1[1] ^ 0x36363636; - w1[2] = w1[2] ^ 0x36363636; - w1[3] = w1[3] ^ 0x36363636; - w2[0] = w2[0] ^ 0x36363636; - w2[1] = w2[1] ^ 0x36363636; - w2[2] = w2[2] ^ 0x36363636; - w2[3] = w2[3] ^ 0x36363636; - w3[0] = w3[0] ^ 0x36363636; - w3[1] = w3[1] ^ 0x36363636; - w3[2] = w3[2] ^ 0x36363636; - w3[3] = w3[3] ^ 0x36363636; - - ipad[0] = SHA1M_A; - ipad[1] = SHA1M_B; - ipad[2] = SHA1M_C; - ipad[3] = SHA1M_D; - ipad[4] = SHA1M_E; - - sha1_transform (w0, w1, w2, w3, ipad); - - w0[0] = w0[0] ^ 0x6a6a6a6a; - w0[1] = w0[1] ^ 0x6a6a6a6a; - w0[2] = w0[2] ^ 0x6a6a6a6a; - w0[3] = w0[3] ^ 0x6a6a6a6a; - w1[0] = w1[0] ^ 0x6a6a6a6a; - w1[1] = w1[1] ^ 0x6a6a6a6a; - w1[2] = w1[2] ^ 0x6a6a6a6a; - w1[3] = w1[3] ^ 0x6a6a6a6a; - w2[0] = w2[0] ^ 0x6a6a6a6a; - w2[1] = w2[1] ^ 0x6a6a6a6a; - w2[2] = w2[2] ^ 0x6a6a6a6a; - w2[3] = w2[3] ^ 0x6a6a6a6a; - w3[0] = w3[0] ^ 0x6a6a6a6a; - w3[1] = w3[1] ^ 0x6a6a6a6a; - w3[2] = w3[2] ^ 0x6a6a6a6a; - w3[3] = w3[3] ^ 0x6a6a6a6a; - - opad[0] = SHA1M_A; - opad[1] = SHA1M_B; - opad[2] = SHA1M_C; - opad[3] = SHA1M_D; - opad[4] = SHA1M_E; - - sha1_transform (w0, w1, w2, w3, opad); -} - -void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5]) +void hmac_sha1_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -392,7 +558,7 @@ void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 digest[3] = ipad[3]; digest[4] = ipad[4]; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_V (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -417,7 +583,7 @@ void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 digest[3] = opad[3]; digest[4] = opad[4]; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_V (w0, w1, w2, w3, digest); } __kernel void m02500_init (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global wpa_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global wpa_t *wpa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -480,27 +646,27 @@ __kernel void m02500_init (__global pw_t *pws, __global kernel_rule_t *rules_buf * pads */ - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); - w2[0] = swap32 (w2[0]); - w2[1] = swap32 (w2[1]); - w2[2] = swap32 (w2[2]); - w2[3] = swap32 (w2[3]); - w3[0] = swap32 (w3[0]); - w3[1] = swap32 (w3[1]); - w3[2] = swap32 (w3[2]); - w3[3] = swap32 (w3[3]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); u32 ipad[5]; u32 opad[5]; - hmac_sha1_pad (w0, w1, w2, w3, ipad, opad); + hmac_sha1_pad_S (w0, w1, w2, w3, ipad, opad); tmps[gid].ipad[0] = ipad[0]; tmps[gid].ipad[1] = ipad[1]; @@ -534,22 +700,22 @@ __kernel void m02500_init (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[3] = 0; if (j == 1) - append_0x01_3x4 (w0, w1, w2, salt_len + 3); + append_0x01_3x4_S (w0, w1, w2, salt_len + 3); else - append_0x02_3x4 (w0, w1, w2, salt_len + 3); - - append_0x80_3x4 (w0, w1, w2, salt_len + 4); - - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); - w2[0] = swap32 (w2[0]); - w2[1] = swap32 (w2[1]); + append_0x02_3x4_S (w0, w1, w2, salt_len + 3); + + append_0x80_3x4_S (w0, w1, w2, salt_len + 4); + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); w2[2] = 0; w2[3] = 0; w3[0] = 0; @@ -559,7 +725,7 @@ __kernel void m02500_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 dgst[5]; - hmac_sha1_run (w0, w1, w2, w3, ipad, opad, dgst); + hmac_sha1_run_S (w0, w1, w2, w3, ipad, opad, dgst); tmps[gid].dgst[i + 0] = dgst[0]; tmps[gid].dgst[i + 1] = dgst[1]; @@ -581,8 +747,10 @@ __kernel void m02500_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf if (gid >= gid_max) return; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; + + #if VECT_SIZE == 1 ipad[0] = tmps[gid].ipad[0]; ipad[1] = tmps[gid].ipad[1]; @@ -596,10 +764,150 @@ __kernel void m02500_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf opad[3] = tmps[gid].opad[3]; opad[4] = tmps[gid].opad[4]; + #else + + const u32 gidx = gid * VECT_SIZE; + + #if VECT_SIZE >= 2 + + if ((gidx + 0) < gid_max) + { + ipad[0].s0 = tmps[gidx + 0].ipad[0]; + ipad[1].s0 = tmps[gidx + 0].ipad[1]; + ipad[2].s0 = tmps[gidx + 0].ipad[2]; + ipad[3].s0 = tmps[gidx + 0].ipad[3]; + ipad[4].s0 = tmps[gidx + 0].ipad[4]; + + opad[0].s0 = tmps[gidx + 0].opad[0]; + opad[1].s0 = tmps[gidx + 0].opad[1]; + opad[2].s0 = tmps[gidx + 0].opad[2]; + opad[3].s0 = tmps[gidx + 0].opad[3]; + opad[4].s0 = tmps[gidx + 0].opad[4]; + } + + if ((gidx + 1) < gid_max) + { + ipad[0].s1 = tmps[gidx + 1].ipad[0]; + ipad[1].s1 = tmps[gidx + 1].ipad[1]; + ipad[2].s1 = tmps[gidx + 1].ipad[2]; + ipad[3].s1 = tmps[gidx + 1].ipad[3]; + ipad[4].s1 = tmps[gidx + 1].ipad[4]; + + opad[0].s1 = tmps[gidx + 1].opad[0]; + opad[1].s1 = tmps[gidx + 1].opad[1]; + opad[2].s1 = tmps[gidx + 1].opad[2]; + opad[3].s1 = tmps[gidx + 1].opad[3]; + opad[4].s1 = tmps[gidx + 1].opad[4]; + } + + #endif + + #if VECT_SIZE >= 4 + + if ((gidx + 2) < gid_max) + { + ipad[0].s2 = tmps[gidx + 2].ipad[0]; + ipad[1].s2 = tmps[gidx + 2].ipad[1]; + ipad[2].s2 = tmps[gidx + 2].ipad[2]; + ipad[3].s2 = tmps[gidx + 2].ipad[3]; + ipad[4].s2 = tmps[gidx + 2].ipad[4]; + + opad[0].s2 = tmps[gidx + 2].opad[0]; + opad[1].s2 = tmps[gidx + 2].opad[1]; + opad[2].s2 = tmps[gidx + 2].opad[2]; + opad[3].s2 = tmps[gidx + 2].opad[3]; + opad[4].s2 = tmps[gidx + 2].opad[4]; + } + + if ((gidx + 3) < gid_max) + { + ipad[0].s3 = tmps[gidx + 3].ipad[0]; + ipad[1].s3 = tmps[gidx + 3].ipad[1]; + ipad[2].s3 = tmps[gidx + 3].ipad[2]; + ipad[3].s3 = tmps[gidx + 3].ipad[3]; + ipad[4].s3 = tmps[gidx + 3].ipad[4]; + + opad[0].s3 = tmps[gidx + 3].opad[0]; + opad[1].s3 = tmps[gidx + 3].opad[1]; + opad[2].s3 = tmps[gidx + 3].opad[2]; + opad[3].s3 = tmps[gidx + 3].opad[3]; + opad[4].s3 = tmps[gidx + 3].opad[4]; + } + + #endif + + #if VECT_SIZE >= 8 + + if ((gidx + 4) < gid_max) + { + ipad[0].s4 = tmps[gidx + 4].ipad[0]; + ipad[1].s4 = tmps[gidx + 4].ipad[1]; + ipad[2].s4 = tmps[gidx + 4].ipad[2]; + ipad[3].s4 = tmps[gidx + 4].ipad[3]; + ipad[4].s4 = tmps[gidx + 4].ipad[4]; + + opad[0].s4 = tmps[gidx + 4].opad[0]; + opad[1].s4 = tmps[gidx + 4].opad[1]; + opad[2].s4 = tmps[gidx + 4].opad[2]; + opad[3].s4 = tmps[gidx + 4].opad[3]; + opad[4].s4 = tmps[gidx + 4].opad[4]; + } + + if ((gidx + 5) < gid_max) + { + ipad[0].s5 = tmps[gidx + 5].ipad[0]; + ipad[1].s5 = tmps[gidx + 5].ipad[1]; + ipad[2].s5 = tmps[gidx + 5].ipad[2]; + ipad[3].s5 = tmps[gidx + 5].ipad[3]; + ipad[4].s5 = tmps[gidx + 5].ipad[4]; + + opad[0].s5 = tmps[gidx + 5].opad[0]; + opad[1].s5 = tmps[gidx + 5].opad[1]; + opad[2].s5 = tmps[gidx + 5].opad[2]; + opad[3].s5 = tmps[gidx + 5].opad[3]; + opad[4].s5 = tmps[gidx + 5].opad[4]; + } + + if ((gidx + 6) < gid_max) + { + ipad[0].s6 = tmps[gidx + 6].ipad[0]; + ipad[1].s6 = tmps[gidx + 6].ipad[1]; + ipad[2].s6 = tmps[gidx + 6].ipad[2]; + ipad[3].s6 = tmps[gidx + 6].ipad[3]; + ipad[4].s6 = tmps[gidx + 6].ipad[4]; + + opad[0].s6 = tmps[gidx + 6].opad[0]; + opad[1].s6 = tmps[gidx + 6].opad[1]; + opad[2].s6 = tmps[gidx + 6].opad[2]; + opad[3].s6 = tmps[gidx + 6].opad[3]; + opad[4].s6 = tmps[gidx + 6].opad[4]; + } + + if ((gidx + 7) < gid_max) + { + ipad[0].s7 = tmps[gidx + 7].ipad[0]; + ipad[1].s7 = tmps[gidx + 7].ipad[1]; + ipad[2].s7 = tmps[gidx + 7].ipad[2]; + ipad[3].s7 = tmps[gidx + 7].ipad[3]; + ipad[4].s7 = tmps[gidx + 7].ipad[4]; + + opad[0].s7 = tmps[gidx + 7].opad[0]; + opad[1].s7 = tmps[gidx + 7].opad[1]; + opad[2].s7 = tmps[gidx + 7].opad[2]; + opad[3].s7 = tmps[gidx + 7].opad[3]; + opad[4].s7 = tmps[gidx + 7].opad[4]; + } + + #endif + + #endif + for (u32 i = 0; i < 8; i += 5) { - u32 dgst[5]; - u32 out[5]; + u32x dgst[5]; + u32x out[5]; + + #if VECT_SIZE == 1 dgst[0] = tmps[gid].dgst[i + 0]; dgst[1] = tmps[gid].dgst[i + 1]; @@ -613,12 +921,148 @@ __kernel void m02500_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf out[3] = tmps[gid].out[i + 3]; out[4] = tmps[gid].out[i + 4]; + #else + + #if VECT_SIZE >= 2 + + if ((gidx + 0) < gid_max) + { + dgst[0].s0 = tmps[gidx + 0].dgst[i + 0]; + dgst[1].s0 = tmps[gidx + 0].dgst[i + 1]; + dgst[2].s0 = tmps[gidx + 0].dgst[i + 2]; + dgst[3].s0 = tmps[gidx + 0].dgst[i + 3]; + dgst[4].s0 = tmps[gidx + 0].dgst[i + 4]; + + out[0].s0 = tmps[gidx + 0].out[i + 0]; + out[1].s0 = tmps[gidx + 0].out[i + 1]; + out[2].s0 = tmps[gidx + 0].out[i + 2]; + out[3].s0 = tmps[gidx + 0].out[i + 3]; + out[4].s0 = tmps[gidx + 0].out[i + 4]; + } + + if ((gidx + 1) < gid_max) + { + dgst[0].s1 = tmps[gidx + 1].dgst[i + 0]; + dgst[1].s1 = tmps[gidx + 1].dgst[i + 1]; + dgst[2].s1 = tmps[gidx + 1].dgst[i + 2]; + dgst[3].s1 = tmps[gidx + 1].dgst[i + 3]; + dgst[4].s1 = tmps[gidx + 1].dgst[i + 4]; + + out[0].s1 = tmps[gidx + 1].out[i + 0]; + out[1].s1 = tmps[gidx + 1].out[i + 1]; + out[2].s1 = tmps[gidx + 1].out[i + 2]; + out[3].s1 = tmps[gidx + 1].out[i + 3]; + out[4].s1 = tmps[gidx + 1].out[i + 4]; + } + + #endif + + #if VECT_SIZE >= 4 + + if ((gidx + 2) < gid_max) + { + dgst[0].s2 = tmps[gidx + 2].dgst[i + 0]; + dgst[1].s2 = tmps[gidx + 2].dgst[i + 1]; + dgst[2].s2 = tmps[gidx + 2].dgst[i + 2]; + dgst[3].s2 = tmps[gidx + 2].dgst[i + 3]; + dgst[4].s2 = tmps[gidx + 2].dgst[i + 4]; + + out[0].s2 = tmps[gidx + 2].out[i + 0]; + out[1].s2 = tmps[gidx + 2].out[i + 1]; + out[2].s2 = tmps[gidx + 2].out[i + 2]; + out[3].s2 = tmps[gidx + 2].out[i + 3]; + out[4].s2 = tmps[gidx + 2].out[i + 4]; + } + + if ((gidx + 3) < gid_max) + { + dgst[0].s3 = tmps[gidx + 3].dgst[i + 0]; + dgst[1].s3 = tmps[gidx + 3].dgst[i + 1]; + dgst[2].s3 = tmps[gidx + 3].dgst[i + 2]; + dgst[3].s3 = tmps[gidx + 3].dgst[i + 3]; + dgst[4].s3 = tmps[gidx + 3].dgst[i + 4]; + + out[0].s3 = tmps[gidx + 3].out[i + 0]; + out[1].s3 = tmps[gidx + 3].out[i + 1]; + out[2].s3 = tmps[gidx + 3].out[i + 2]; + out[3].s3 = tmps[gidx + 3].out[i + 3]; + out[4].s3 = tmps[gidx + 3].out[i + 4]; + } + + #endif + + #if VECT_SIZE >= 8 + + if ((gidx + 4) < gid_max) + { + dgst[0].s4 = tmps[gidx + 4].dgst[i + 0]; + dgst[1].s4 = tmps[gidx + 4].dgst[i + 1]; + dgst[2].s4 = tmps[gidx + 4].dgst[i + 2]; + dgst[3].s4 = tmps[gidx + 4].dgst[i + 3]; + dgst[4].s4 = tmps[gidx + 4].dgst[i + 4]; + + out[0].s4 = tmps[gidx + 4].out[i + 0]; + out[1].s4 = tmps[gidx + 4].out[i + 1]; + out[2].s4 = tmps[gidx + 4].out[i + 2]; + out[3].s4 = tmps[gidx + 4].out[i + 3]; + out[4].s4 = tmps[gidx + 4].out[i + 4]; + } + + if ((gidx + 5) < gid_max) + { + dgst[0].s5 = tmps[gidx + 5].dgst[i + 0]; + dgst[1].s5 = tmps[gidx + 5].dgst[i + 1]; + dgst[2].s5 = tmps[gidx + 5].dgst[i + 2]; + dgst[3].s5 = tmps[gidx + 5].dgst[i + 3]; + dgst[4].s5 = tmps[gidx + 5].dgst[i + 4]; + + out[0].s5 = tmps[gidx + 5].out[i + 0]; + out[1].s5 = tmps[gidx + 5].out[i + 1]; + out[2].s5 = tmps[gidx + 5].out[i + 2]; + out[3].s5 = tmps[gidx + 5].out[i + 3]; + out[4].s5 = tmps[gidx + 5].out[i + 4]; + } + + if ((gidx + 6) < gid_max) + { + dgst[0].s6 = tmps[gidx + 6].dgst[i + 0]; + dgst[1].s6 = tmps[gidx + 6].dgst[i + 1]; + dgst[2].s6 = tmps[gidx + 6].dgst[i + 2]; + dgst[3].s6 = tmps[gidx + 6].dgst[i + 3]; + dgst[4].s6 = tmps[gidx + 6].dgst[i + 4]; + + out[0].s6 = tmps[gidx + 6].out[i + 0]; + out[1].s6 = tmps[gidx + 6].out[i + 1]; + out[2].s6 = tmps[gidx + 6].out[i + 2]; + out[3].s6 = tmps[gidx + 6].out[i + 3]; + out[4].s6 = tmps[gidx + 6].out[i + 4]; + } + + if ((gidx + 7) < gid_max) + { + dgst[0].s7 = tmps[gidx + 7].dgst[i + 0]; + dgst[1].s7 = tmps[gidx + 7].dgst[i + 1]; + dgst[2].s7 = tmps[gidx + 7].dgst[i + 2]; + dgst[3].s7 = tmps[gidx + 7].dgst[i + 3]; + dgst[4].s7 = tmps[gidx + 7].dgst[i + 4]; + + out[0].s7 = tmps[gidx + 7].out[i + 0]; + out[1].s7 = tmps[gidx + 7].out[i + 1]; + out[2].s7 = tmps[gidx + 7].out[i + 2]; + out[3].s7 = tmps[gidx + 7].out[i + 3]; + out[4].s7 = tmps[gidx + 7].out[i + 4]; + } + + #endif + + #endif + for (u32 j = 0; j < loop_cnt; j++) { - u32 w0[4]; - u32 w1[4]; - u32 w2[4]; - u32 w3[4]; + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; w0[0] = dgst[0]; w0[1] = dgst[1]; @@ -637,7 +1081,7 @@ __kernel void m02500_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = 0; w3[3] = (64 + 20) * 8; - hmac_sha1_run (w0, w1, w2, w3, ipad, opad, dgst); + hmac_sha1_run_V (w0, w1, w2, w3, ipad, opad, dgst); out[0] ^= dgst[0]; out[1] ^= dgst[1]; @@ -646,6 +1090,8 @@ __kernel void m02500_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf out[4] ^= dgst[4]; } + #if VECT_SIZE == 1 + tmps[gid].dgst[i + 0] = dgst[0]; tmps[gid].dgst[i + 1] = dgst[1]; tmps[gid].dgst[i + 2] = dgst[2]; @@ -657,6 +1103,142 @@ __kernel void m02500_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf tmps[gid].out[i + 2] = out[2]; tmps[gid].out[i + 3] = out[3]; tmps[gid].out[i + 4] = out[4]; + + #else + + #if VECT_SIZE >= 2 + + if ((gidx + 0) < gid_max) + { + tmps[gidx + 0].dgst[i + 0] = dgst[0].s0; + tmps[gidx + 0].dgst[i + 1] = dgst[1].s0; + tmps[gidx + 0].dgst[i + 2] = dgst[2].s0; + tmps[gidx + 0].dgst[i + 3] = dgst[3].s0; + tmps[gidx + 0].dgst[i + 4] = dgst[4].s0; + + tmps[gidx + 0].out[i + 0] = out[0].s0; + tmps[gidx + 0].out[i + 1] = out[1].s0; + tmps[gidx + 0].out[i + 2] = out[2].s0; + tmps[gidx + 0].out[i + 3] = out[3].s0; + tmps[gidx + 0].out[i + 4] = out[4].s0; + } + + if ((gidx + 1) < gid_max) + { + tmps[gidx + 1].dgst[i + 0] = dgst[0].s1; + tmps[gidx + 1].dgst[i + 1] = dgst[1].s1; + tmps[gidx + 1].dgst[i + 2] = dgst[2].s1; + tmps[gidx + 1].dgst[i + 3] = dgst[3].s1; + tmps[gidx + 1].dgst[i + 4] = dgst[4].s1; + + tmps[gidx + 1].out[i + 0] = out[0].s1; + tmps[gidx + 1].out[i + 1] = out[1].s1; + tmps[gidx + 1].out[i + 2] = out[2].s1; + tmps[gidx + 1].out[i + 3] = out[3].s1; + tmps[gidx + 1].out[i + 4] = out[4].s1; + } + + #endif + + #if VECT_SIZE >= 4 + + if ((gidx + 2) < gid_max) + { + tmps[gidx + 2].dgst[i + 0] = dgst[0].s2; + tmps[gidx + 2].dgst[i + 1] = dgst[1].s2; + tmps[gidx + 2].dgst[i + 2] = dgst[2].s2; + tmps[gidx + 2].dgst[i + 3] = dgst[3].s2; + tmps[gidx + 2].dgst[i + 4] = dgst[4].s2; + + tmps[gidx + 2].out[i + 0] = out[0].s2; + tmps[gidx + 2].out[i + 1] = out[1].s2; + tmps[gidx + 2].out[i + 2] = out[2].s2; + tmps[gidx + 2].out[i + 3] = out[3].s2; + tmps[gidx + 2].out[i + 4] = out[4].s2; + } + + if ((gidx + 3) < gid_max) + { + tmps[gidx + 3].dgst[i + 0] = dgst[0].s3; + tmps[gidx + 3].dgst[i + 1] = dgst[1].s3; + tmps[gidx + 3].dgst[i + 2] = dgst[2].s3; + tmps[gidx + 3].dgst[i + 3] = dgst[3].s3; + tmps[gidx + 3].dgst[i + 4] = dgst[4].s3; + + tmps[gidx + 3].out[i + 0] = out[0].s3; + tmps[gidx + 3].out[i + 1] = out[1].s3; + tmps[gidx + 3].out[i + 2] = out[2].s3; + tmps[gidx + 3].out[i + 3] = out[3].s3; + tmps[gidx + 3].out[i + 4] = out[4].s3; + } + + #endif + + #if VECT_SIZE >= 8 + + if ((gidx + 4) < gid_max) + { + tmps[gidx + 4].dgst[i + 0] = dgst[0].s4; + tmps[gidx + 4].dgst[i + 1] = dgst[1].s4; + tmps[gidx + 4].dgst[i + 2] = dgst[2].s4; + tmps[gidx + 4].dgst[i + 3] = dgst[3].s4; + tmps[gidx + 4].dgst[i + 4] = dgst[4].s4; + + tmps[gidx + 4].out[i + 0] = out[0].s4; + tmps[gidx + 4].out[i + 1] = out[1].s4; + tmps[gidx + 4].out[i + 2] = out[2].s4; + tmps[gidx + 4].out[i + 3] = out[3].s4; + tmps[gidx + 4].out[i + 4] = out[4].s4; + } + + if ((gidx + 5) < gid_max) + { + tmps[gidx + 5].dgst[i + 0] = dgst[0].s5; + tmps[gidx + 5].dgst[i + 1] = dgst[1].s5; + tmps[gidx + 5].dgst[i + 2] = dgst[2].s5; + tmps[gidx + 5].dgst[i + 3] = dgst[3].s5; + tmps[gidx + 5].dgst[i + 4] = dgst[4].s5; + + tmps[gidx + 5].out[i + 0] = out[0].s5; + tmps[gidx + 5].out[i + 1] = out[1].s5; + tmps[gidx + 5].out[i + 2] = out[2].s5; + tmps[gidx + 5].out[i + 3] = out[3].s5; + tmps[gidx + 5].out[i + 4] = out[4].s5; + } + + if ((gidx + 6) < gid_max) + { + tmps[gidx + 6].dgst[i + 0] = dgst[0].s6; + tmps[gidx + 6].dgst[i + 1] = dgst[1].s6; + tmps[gidx + 6].dgst[i + 2] = dgst[2].s6; + tmps[gidx + 6].dgst[i + 3] = dgst[3].s6; + tmps[gidx + 6].dgst[i + 4] = dgst[4].s6; + + tmps[gidx + 6].out[i + 0] = out[0].s6; + tmps[gidx + 6].out[i + 1] = out[1].s6; + tmps[gidx + 6].out[i + 2] = out[2].s6; + tmps[gidx + 6].out[i + 3] = out[3].s6; + tmps[gidx + 6].out[i + 4] = out[4].s6; + } + + if ((gidx + 7) < gid_max) + { + tmps[gidx + 7].dgst[i + 0] = dgst[0].s7; + tmps[gidx + 7].dgst[i + 1] = dgst[1].s7; + tmps[gidx + 7].dgst[i + 2] = dgst[2].s7; + tmps[gidx + 7].dgst[i + 3] = dgst[3].s7; + tmps[gidx + 7].dgst[i + 4] = dgst[4].s7; + + tmps[gidx + 7].out[i + 0] = out[0].s7; + tmps[gidx + 7].out[i + 1] = out[1].s7; + tmps[gidx + 7].out[i + 2] = out[2].s7; + tmps[gidx + 7].out[i + 3] = out[3].s7; + tmps[gidx + 7].out[i + 4] = out[4].s7; + } + + #endif + + #endif } } @@ -693,7 +1275,7 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 ipad[5]; u32 opad[5]; - hmac_sha1_pad (w0, w1, w2, w3, ipad, opad); + hmac_sha1_pad_S (w0, w1, w2, w3, ipad, opad); w0[0] = wpa_bufs[salt_pos].pke[ 0]; w0[1] = wpa_bufs[salt_pos].pke[ 1]; @@ -712,7 +1294,7 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = wpa_bufs[salt_pos].pke[14]; w3[3] = wpa_bufs[salt_pos].pke[15]; - sha1_transform (w0, w1, w2, w3, ipad); + sha1_transform_S (w0, w1, w2, w3, ipad); w0[0] = wpa_bufs[salt_pos].pke[16]; w0[1] = wpa_bufs[salt_pos].pke[17]; @@ -733,13 +1315,13 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 digest[5]; - hmac_sha1_run (w0, w1, w2, w3, ipad, opad, digest); + hmac_sha1_run_S (w0, w1, w2, w3, ipad, opad, digest); { - w0[0] = swap32 (digest[0]); - w0[1] = swap32 (digest[1]); - w0[2] = swap32 (digest[2]); - w0[3] = swap32 (digest[3]); + w0[0] = swap32_S (digest[0]); + w0[1] = swap32_S (digest[1]); + w0[2] = swap32_S (digest[2]); + w0[3] = swap32_S (digest[3]); w1[0] = 0; w1[1] = 0; w1[2] = 0; @@ -753,7 +1335,7 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = 0; w3[3] = 0; - hmac_md5_pad (w0, w1, w2, w3, ipad, opad); + hmac_md5_pad_S (w0, w1, w2, w3, ipad, opad); int eapol_size = wpa_bufs[salt_pos].eapol_size; @@ -779,7 +1361,7 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = wpa_bufs[salt_pos].eapol[eapol_off + 14]; w3[3] = wpa_bufs[salt_pos].eapol[eapol_off + 15]; - md5_transform (w0, w1, w2, w3, ipad); + md5_transform_S (w0, w1, w2, w3, ipad); } w0[0] = wpa_bufs[salt_pos].eapol[eapol_off + 0]; @@ -801,7 +1383,7 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 digest1[4]; - hmac_md5_run (w0, w1, w2, w3, ipad, opad, digest1); + hmac_md5_run_S (w0, w1, w2, w3, ipad, opad, digest1); /** * base @@ -835,7 +1417,7 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = 0; w3[3] = 0; - hmac_sha1_pad (w0, w1, w2, w3, ipad, opad); + hmac_sha1_pad_S (w0, w1, w2, w3, ipad, opad); int eapol_size = wpa_bufs[salt_pos].eapol_size; @@ -861,7 +1443,7 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = wpa_bufs[salt_pos].eapol[eapol_off + 14]; w3[3] = wpa_bufs[salt_pos].eapol[eapol_off + 15]; - sha1_transform (w0, w1, w2, w3, ipad); + sha1_transform_S (w0, w1, w2, w3, ipad); } w0[0] = wpa_bufs[salt_pos].eapol[eapol_off + 0]; @@ -883,7 +1465,7 @@ __kernel void m02500_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 digest2[5]; - hmac_sha1_run (w0, w1, w2, w3, ipad, opad, digest2); + hmac_sha1_run_S (w0, w1, w2, w3, ipad, opad, digest2); /** * base diff --git a/include/kernel_functions.c b/include/kernel_functions.c index a10f535..258238c 100644 --- a/include/kernel_functions.c +++ b/include/kernel_functions.c @@ -137,6 +137,15 @@ #define SHA1_F2o(x,y,z) (SHA1_F2 ((x), (y), (z))) #endif +#define SHA1_STEP_S(f,a,b,c,d,e,x) \ +{ \ + e += K; \ + e += x; \ + e += f (b, c, d); \ + e += rotl32_S (a, 5u); \ + b = rotl32_S (b, 30u); \ +} + #define SHA1_STEP(f,a,b,c,d,e,x) \ { \ e += K; \ diff --git a/include/kernel_vendor.h b/include/kernel_vendor.h index 8d869bd..7f9c789 100644 --- a/include/kernel_vendor.h +++ b/include/kernel_vendor.h @@ -29,8 +29,25 @@ #if VENDOR_ID == (1 << 0) #define IS_AMD -#elif VENDOR_ID == (1 << 6) +//#define IS_GENERIC +#elif VENDOR_ID == (1 << 1) +#define IS_APPLE +#define IS_GENERIC +#elif VENDOR_ID == (1 << 2) +#define IS_INTEL_BEIGNET +#define IS_GENERIC +#elif VENDOR_ID == (1 << 3) +#define IS_INTEL_SDK +#define IS_GENERIC +#elif VENDOR_ID == (1 << 4) +#define IS_MESA +#define IS_GENERIC +#elif VENDOR_ID == (1 << 5) #define IS_NV +//#define IS_GENERIC +#elif VENDOR_ID == (1 << 6) +#define IS_POCL +#define IS_GENERIC #else #define IS_GENERIC #endif diff --git a/include/shared.h b/include/shared.h index 28d4286..8805d6e 100644 --- a/include/shared.h +++ b/include/shared.h @@ -135,7 +135,7 @@ static inline int CPU_ISSET (int num, cpu_set_t *cs) { return (cs->count & (1 < #define CL_VENDOR_AMD "Advanced Micro Devices, Inc." #define CL_VENDOR_APPLE "Apple" #define CL_VENDOR_INTEL_BEIGNET "Intel" -#define CL_VENDOR_INTEL_SDK "Intel(R) OpenCL" +#define CL_VENDOR_INTEL_SDK "Intel(R) Corporation" #define CL_VENDOR_MESA "Mesa" #define CL_VENDOR_NV "NVIDIA Corporation" #define CL_VENDOR_POCL "The pocl project" @@ -144,10 +144,9 @@ static inline int CPU_ISSET (int num, cpu_set_t *cs) { return (cs->count & (1 < #define VENDOR_ID_APPLE (1 << 1) #define VENDOR_ID_INTEL_BEIGNET (1 << 2) #define VENDOR_ID_INTEL_SDK (1 << 3) -#define VENDOR_ID INTEL (1 << 4) -#define VENDOR_ID_MESA (1 << 5) -#define VENDOR_ID_NV (1 << 6) -#define VENDOR_ID_POCL (1 << 7) +#define VENDOR_ID_MESA (1 << 4) +#define VENDOR_ID_NV (1 << 5) +#define VENDOR_ID_POCL (1 << 6) #define VENDOR_ID_GENERIC (1 << 31) #define BLOCK_SIZE 64 diff --git a/src/hashcat.c b/src/hashcat.c index e5f6777..1619c50 100644 --- a/src/hashcat.c +++ b/src/hashcat.c @@ -8167,7 +8167,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_4_4; parse_func = wpa_parse_hash; sort_by_digest = sort_by_digest_4_4; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -12684,19 +12685,19 @@ int main (int argc, char **argv) } else if (strcmp (platform_vendor, CL_VENDOR_APPLE) == 0) { - vendor_id = VENDOR_ID_GENERIC; + vendor_id = VENDOR_ID_APPLE; } else if (strcmp (platform_vendor, CL_VENDOR_INTEL_BEIGNET) == 0) { - vendor_id = VENDOR_ID_GENERIC; + vendor_id = VENDOR_ID_INTEL_BEIGNET; } else if (strcmp (platform_vendor, CL_VENDOR_INTEL_SDK) == 0) { - vendor_id = VENDOR_ID_GENERIC; + vendor_id = VENDOR_ID_INTEL_SDK; } else if (strcmp (platform_vendor, CL_VENDOR_MESA) == 0) { - vendor_id = VENDOR_ID_GENERIC; + vendor_id = VENDOR_ID_MESA; } else if (strcmp (platform_vendor, CL_VENDOR_NV) == 0) { @@ -12704,7 +12705,7 @@ int main (int argc, char **argv) } else if (strcmp (platform_vendor, CL_VENDOR_POCL) == 0) { - vendor_id = VENDOR_ID_GENERIC; + vendor_id = VENDOR_ID_POCL; } else { @@ -14007,6 +14008,21 @@ int main (int argc, char **argv) snprintf (build_opts, sizeof (build_opts) - 1, "-cl-std=CL1.1 -I\"%s/\" -DVENDOR_ID=%u -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u -DKERN_TYPE=%u -D_unroll", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, kern_type); + if (device_param->vendor_id == VENDOR_ID_INTEL_SDK) + { + // we do vectorizing much better than the auto-vectorizer + + char build_opts_new[1024] = { 0 }; + + snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -cl-opt-disable", build_opts); + + strncpy (build_opts, build_opts_new, sizeof (build_opts) - 1); + } + + #ifdef DEBUG + log_info ("Device #%u: build_opts '%s'\n", device_id + 1, build_opts); + #endif + /** * main kernel */ -- 2.25.1