From 2dd8156d24d3140d1423abc7a7ca9dc995103e64 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 26 May 2016 16:45:52 +0200 Subject: [PATCH] Introduce a true memset kernel, currently operates on 16 byte per item --- OpenCL/inc_common.cl | 9 +++ include/types.h | 4 +- src/hashcat.c | 139 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 131 insertions(+), 21 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index ca811e5..cfd4ba4 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -9322,3 +9322,12 @@ inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], #endif } + +__kernel void gpu_memset (__global uint4 *buf, const u32 value, const u32 gid_max) +{ + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + buf[gid] = (uint4) (value); +} diff --git a/include/types.h b/include/types.h index 3b4b8e2..2a2b7c0 100644 --- a/include/types.h +++ b/include/types.h @@ -1016,13 +1016,13 @@ struct __hc_device_param cl_kernel kernel_amp; cl_kernel kernel_tm; cl_kernel kernel_weak; + cl_kernel kernel_memset; cl_context context; cl_program program; cl_program program_mp; cl_program program_amp; - cl_program program_weak; cl_command_queue command_queue; @@ -1064,6 +1064,7 @@ struct __hc_device_param void *kernel_params_mp_l[PARAMCNT]; void *kernel_params_amp[PARAMCNT]; void *kernel_params_tm[PARAMCNT]; + void *kernel_params_memset[PARAMCNT]; u32 kernel_params_buf32[PARAMCNT]; @@ -1077,6 +1078,7 @@ struct __hc_device_param u64 kernel_params_mp_l_buf64[PARAMCNT]; u32 kernel_params_amp_buf32[PARAMCNT]; + u32 kernel_params_memset_buf32[PARAMCNT]; }; typedef struct __hc_device_param hc_device_param_t; diff --git a/src/hashcat.c b/src/hashcat.c index 7f67789..22f8c13 100644 --- a/src/hashcat.c +++ b/src/hashcat.c @@ -2609,8 +2609,56 @@ static void run_kernel_amp (hc_device_param_t *device_param, const uint num) hc_clFinish (data.ocl, device_param->command_queue); } +static void run_kernel_memset (hc_device_param_t *device_param, cl_mem buf, const uint value, const uint num) +{ + const u32 num16d = num / 16; + const u32 num16m = num % 16; + + if (num16d) + { + device_param->kernel_params_memset_buf32[1] = value; + device_param->kernel_params_memset_buf32[2] = num16d; + + uint kernel_threads = device_param->kernel_threads; + + uint num_elements = num16d; + + while (num_elements % kernel_threads) num_elements++; + + cl_kernel kernel = device_param->kernel_memset; + + hc_clSetKernelArg (data.ocl, kernel, 0, sizeof (cl_mem), (void *) &buf); + hc_clSetKernelArg (data.ocl, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); + hc_clSetKernelArg (data.ocl, kernel, 2, sizeof (cl_uint), device_param->kernel_params_memset[2]); + + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); + + hc_clFlush (data.ocl, device_param->command_queue); + + hc_clFinish (data.ocl, device_param->command_queue); + } + + if (num16m) + { + u32 tmp[4]; + + tmp[0] = value; + tmp[1] = value; + tmp[2] = value; + tmp[3] = value; + + hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, buf, CL_TRUE, num16d * 16, num16m, tmp, 0, NULL, NULL); + } +} + static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const size_t size) { + run_kernel_memset (device_param, buf, 0, size); + + /* int rc = -1; if (device_param->opencl_v12 && device_param->platform_vendor_id == VENDOR_ID_AMD) @@ -2644,6 +2692,7 @@ static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const myfree (tmp); } + */ } static void choose_kernel (hc_device_param_t *device_param, const uint attack_exec, const uint attack_mode, const uint opts_type, const salt_t *salt_buf, const uint highest_pw_len, const uint pws_cnt) @@ -2895,6 +2944,14 @@ static void autotune (hc_device_param_t *device_param) const u32 kernel_power_max = device_param->device_processors * device_param->kernel_threads * kernel_accel_max; + run_kernel_memset (device_param, device_param->d_pws_buf, 7, kernel_power_max * sizeof (pw_t)); + + if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) + { + run_kernel_memset (device_param, device_param->d_pws_amp_buf, 7, kernel_power_max * sizeof (pw_t)); + } + + /* for (u32 i = 0; i < kernel_power_max; i++) { device_param->pws_buf[i].i[0] = i; @@ -2908,6 +2965,7 @@ static void autotune (hc_device_param_t *device_param) { run_kernel_amp (device_param, kernel_power_max); } + */ #define VERIFIER_CNT 1 @@ -3030,10 +3088,19 @@ static void autotune (hc_device_param_t *device_param) // reset them fake words + /* memset (device_param->pws_buf, 0, kernel_power_max * sizeof (pw_t)); hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power_max * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power_max * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + */ + + run_kernel_memset (device_param, device_param->d_pws_buf, 0, kernel_power_max * sizeof (pw_t)); + + if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) + { + run_kernel_memset (device_param, device_param->d_pws_amp_buf, 0, kernel_power_max * sizeof (pw_t)); + } // reset timer @@ -15084,13 +15151,6 @@ int main (int argc, char **argv) hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_digests_shown, CL_TRUE, 0, size_shown, data.digests_shown, 0, NULL, NULL); hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_salt_bufs, CL_TRUE, 0, size_salts, data.salts_buf, 0, NULL, NULL); - run_kernel_bzero (device_param, device_param->d_pws_buf, size_pws); - run_kernel_bzero (device_param, device_param->d_pws_amp_buf, size_pws); - run_kernel_bzero (device_param, device_param->d_tmps, size_tmps); - run_kernel_bzero (device_param, device_param->d_hooks, size_hooks); - run_kernel_bzero (device_param, device_param->d_plain_bufs, size_plains); - run_kernel_bzero (device_param, device_param->d_result, size_results); - /** * special buffers */ @@ -15101,8 +15161,6 @@ int main (int argc, char **argv) device_param->d_rules_c = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_rules_c, NULL); hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_rules, CL_TRUE, 0, size_rules, kernel_rules_buf, 0, NULL, NULL); - - run_kernel_bzero (device_param, device_param->d_rules_c, size_rules_c); } else if (attack_kern == ATTACK_KERN_COMBI) { @@ -15110,11 +15168,6 @@ int main (int argc, char **argv) device_param->d_combs_c = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_combs, NULL); device_param->d_root_css_buf = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_root_css, NULL); device_param->d_markov_css_buf = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_markov_css, NULL); - - run_kernel_bzero (device_param, device_param->d_combs, size_combs); - run_kernel_bzero (device_param, device_param->d_combs_c, size_combs); - run_kernel_bzero (device_param, device_param->d_root_css_buf, size_root_css); - run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css); } else if (attack_kern == ATTACK_KERN_BF) { @@ -15123,12 +15176,6 @@ int main (int argc, char **argv) device_param->d_tm_c = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_tm, NULL); device_param->d_root_css_buf = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_root_css, NULL); device_param->d_markov_css_buf = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_markov_css, NULL); - - run_kernel_bzero (device_param, device_param->d_bfs, size_bfs); - run_kernel_bzero (device_param, device_param->d_bfs_c, size_bfs); - run_kernel_bzero (device_param, device_param->d_tm_c, size_tm); - run_kernel_bzero (device_param, device_param->d_root_css_buf, size_root_css); - run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css); } if (size_esalts) @@ -15272,6 +15319,13 @@ int main (int argc, char **argv) device_param->kernel_params_tm[0] = &device_param->d_bfs_c; device_param->kernel_params_tm[1] = &device_param->d_tm_c; + device_param->kernel_params_memset_buf32[1] = 0; // value + device_param->kernel_params_memset_buf32[2] = 0; // gid_max + + device_param->kernel_params_memset[0] = NULL; + device_param->kernel_params_memset[1] = &device_param->kernel_params_memset_buf32[1]; + device_param->kernel_params_memset[2] = &device_param->kernel_params_memset_buf32[2]; + /** * kernel name */ @@ -15380,6 +15434,18 @@ int main (int argc, char **argv) if (opts_type & OPTS_TYPE_HOOK23) hc_clSetKernelArg (data.ocl, device_param->kernel23, i, sizeof (cl_uint), device_param->kernel_params[i]); } + // GPU memset + + device_param->kernel_memset = hc_clCreateKernel (data.ocl, device_param->program, "gpu_memset"); + + hc_clGetKernelWorkGroupInfo (data.ocl, device_param->kernel_memset, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &kernel_wgs_tmp, NULL); kernel_threads = MIN (kernel_threads, kernel_wgs_tmp); + + hc_clSetKernelArg (data.ocl, device_param->kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]); + hc_clSetKernelArg (data.ocl, device_param->kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); + hc_clSetKernelArg (data.ocl, device_param->kernel_memset, 2, sizeof (cl_uint), device_param->kernel_params_memset[2]); + + // MP start + if (attack_mode == ATTACK_MODE_BF) { device_param->kernel_mp_l = hc_clCreateKernel (data.ocl, device_param->program_mp, "l_markov"); @@ -15440,6 +15506,39 @@ int main (int argc, char **argv) device_param->kernel_threads = kernel_threads; + // zero some data buffers + + run_kernel_bzero (device_param, device_param->d_pws_buf, size_pws); + run_kernel_bzero (device_param, device_param->d_pws_amp_buf, size_pws); + run_kernel_bzero (device_param, device_param->d_tmps, size_tmps); + run_kernel_bzero (device_param, device_param->d_hooks, size_hooks); + run_kernel_bzero (device_param, device_param->d_plain_bufs, size_plains); + run_kernel_bzero (device_param, device_param->d_result, size_results); + + /** + * special buffers + */ + + if (attack_kern == ATTACK_KERN_STRAIGHT) + { + run_kernel_bzero (device_param, device_param->d_rules_c, size_rules_c); + } + else if (attack_kern == ATTACK_KERN_COMBI) + { + run_kernel_bzero (device_param, device_param->d_combs, size_combs); + run_kernel_bzero (device_param, device_param->d_combs_c, size_combs); + run_kernel_bzero (device_param, device_param->d_root_css_buf, size_root_css); + run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css); + } + else if (attack_kern == ATTACK_KERN_BF) + { + run_kernel_bzero (device_param, device_param->d_bfs, size_bfs); + run_kernel_bzero (device_param, device_param->d_bfs_c, size_bfs); + run_kernel_bzero (device_param, device_param->d_tm_c, size_tm); + run_kernel_bzero (device_param, device_param->d_root_css_buf, size_root_css); + run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css); + } + /** * Store initial fanspeed if gpu_temp_retain is enabled */ -- 2.43.0