#define MARKOV_DISABLE 0
#define MARKOV_CLASSIC 0
#define BENCHMARK 0
-#define BENCHMARK_REPEATS 100
#define RESTORE 0
#define RESTORE_TIMER 60
#define RESTORE_DISABLE 0
if (rc != 0)
{
// NOTE: clEnqueueFillBuffer () always fails with -59
- // IOW, it's not supported by Nvidia ForceWare <= 352.21, also pocl segfaults, also on apple
+ // IOW, it's not supported by Nvidia drivers <= 352.21, also pocl segfaults, also on apple
// How's that possible, OpenCL 1.2 support is advertised??
// We need to workaround...
device_param->speed_cnt[speed_pos] = perf_sum_all;
device_param->speed_ms[speed_pos] = speed_ms;
+
+ if (data.benchmark == 1)
+ {
+ if (speed_ms > 4096) data.devices_status = STATUS_ABORTED;
+ }
}
if (opts_type & OPTS_TYPE_HOOK23)
}
}
-static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const int repeat)
+static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
{
const u32 kernel_power = device_param->device_processors * device_param->kernel_threads * kernel_accel;
- device_param->kernel_params_buf32[26] = kernel_loops;
- device_param->kernel_params_buf32[27] = kernel_loops;
+ device_param->kernel_params_buf32[25] = 0;
+ device_param->kernel_params_buf32[26] = kernel_loops; // not a bug, both need to be set
+ device_param->kernel_params_buf32[27] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
// init some fake words
- for (u32 i = 0; i < kernel_power; i++)
+ if (data.hash_mode == 10700)
{
- device_param->pws_buf[i].i[0] = i;
- device_param->pws_buf[i].i[1] = 0x01234567;
- device_param->pws_buf[i].pw_len = 4 + (i & 3);
- }
+ // hash mode 10700 hangs on length 0 (unlimited loop)
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ for (u32 i = 0; i < kernel_power; i++)
+ {
+ device_param->pws_buf[i].i[0] = i;
+ device_param->pws_buf[i].i[1] = i + 0x01234567;
+ device_param->pws_buf[i].i[2] = i + 0x89abcdef;
+ device_param->pws_buf[i].i[3] = 0xffffffff;
+ device_param->pws_buf[i].pw_len = 4 + (i & 3);
+ }
- if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
- {
- run_kernel_amp (device_param, kernel_power);
- }
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
- // caching run
+ if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+ {
+ run_kernel_amp (device_param, kernel_power);
+ }
+ }
if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
{
- run_kernel (KERN_RUN_1, device_param, kernel_power, false);
+ run_kernel (KERN_RUN_1, device_param, kernel_power, true);
}
else
{
- run_kernel (KERN_RUN_2, device_param, kernel_power, false);
+ run_kernel (KERN_RUN_2, device_param, kernel_power, true);
}
- // now user repeats
-
- for (int i = 0; i < repeat; i++)
- {
- if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
- {
- run_kernel (KERN_RUN_1, device_param, kernel_power, true);
- }
- else
- {
- run_kernel (KERN_RUN_2, device_param, kernel_power, true);
- }
- }
-
- const double exec_ms_prev = get_avg_exec_time (device_param, repeat);
+ const double exec_ms_prev = get_avg_exec_time (device_param, 1);
// reset fake words
- memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t));
+ if (data.hash_mode == 10700)
+ {
+ memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t));
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ }
return exec_ms_prev;
}
u32 kernel_accel = kernel_accel_min;
u32 kernel_loops = kernel_loops_min;
- // steps
-
#define STEPS_CNT 10
- #define STEPS_ACCEL_CNT (STEPS_CNT + 2)
- #define STEPS_LOOPS_CNT (STEPS_CNT + 2)
-
- u32 steps_accel[STEPS_ACCEL_CNT];
- u32 steps_loops[STEPS_LOOPS_CNT];
-
- for (int i = 0; i < STEPS_ACCEL_CNT; i++)
- {
- steps_accel[i] = 1 << i;
- }
+ #define MAX_RETRIES 1
- for (int i = 0; i < STEPS_LOOPS_CNT; i++)
+ if ((kernel_loops_min == kernel_loops_max) || (kernel_accel_min == kernel_accel_max))
{
- steps_loops[i] = 1 << i;
+ // we do this in case the user specified a fixed -u and -n on the commandline
+ // so we have a cached kernel for benchmark
+
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
}
- steps_accel[STEPS_CNT + 0] = kernel_accel_min;
- steps_accel[STEPS_CNT + 1] = kernel_accel_max;
-
- steps_loops[STEPS_CNT + 0] = kernel_loops_min;
- steps_loops[STEPS_CNT + 1] = kernel_loops_max;
+ double exec_ms_final = try_run (device_param, kernel_accel, kernel_loops);
- qsort (steps_accel, STEPS_ACCEL_CNT, sizeof (u32), sort_by_u32);
- qsort (steps_loops, STEPS_LOOPS_CNT, sizeof (u32), sort_by_u32);
+ // first find out highest kernel-loops that stays below target_ms
- // find out highest kernel-loops that stays below target_ms, we can use it later for multiplication as this is a linear function
+ for (kernel_loops = kernel_loops_max; kernel_loops > kernel_loops_min; kernel_loops >>= 1)
+ {
+ double exec_ms_best = try_run (device_param, kernel_accel_min, kernel_loops);
- u32 kernel_loops_tmp;
+ for (int i = 0; i < MAX_RETRIES; i++)
+ {
+ const double exec_ms_cur = try_run (device_param, kernel_accel_min, kernel_loops);
- for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp > kernel_loops_min; kernel_loops_tmp >>= 1)
- {
- const double exec_ms = try_run (device_param, kernel_accel_min, kernel_loops_tmp, 1);
+ exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
+ }
- if (exec_ms < target_ms) break;
+ if (exec_ms_best < target_ms) break;
}
- // kernel-accel
+ // now the same for kernel-accel but with the new kernel-loops from previous loop set
if (kernel_accel_min < kernel_accel_max)
{
- double e_best = 0;
-
- for (int i = 0; i < STEPS_ACCEL_CNT; i++)
+ for (int i = 0; i < STEPS_CNT; i++)
{
- const u32 kernel_accel_try = steps_accel[i];
+ const u32 kernel_accel_try = 1 << i;
if (kernel_accel_try < kernel_accel_min) continue;
if (kernel_accel_try > kernel_accel_max) break;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1);
-
- if (exec_ms > target_ms) break;
-
- const double e = kernel_accel_try / exec_ms;
+ double exec_ms_best = try_run (device_param, kernel_accel_try, kernel_loops);
- if (e > e_best)
+ for (int i = 0; i < MAX_RETRIES; i++)
{
- kernel_accel = kernel_accel_try;
+ const double exec_ms_cur = try_run (device_param, kernel_accel_try, kernel_loops);
- e_best = e;
+ exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
}
- }
- }
-
- // kernel-loops final
-
- if (kernel_loops_min < kernel_loops_max)
- {
- double e_best = 0;
- for (int i = 0; i < STEPS_LOOPS_CNT; i++)
- {
- const u32 kernel_loops_try = steps_loops[i];
-
- if (kernel_loops_try < kernel_loops_min) continue;
- if (kernel_loops_try > kernel_loops_max) break;
-
- const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1);
+ if (exec_ms_best > target_ms) break;
- if (exec_ms > target_ms) break;
+ exec_ms_final = exec_ms_best;
- const double e = kernel_loops_try / exec_ms;
-
- if (e > e_best)
- {
- kernel_loops = kernel_loops_try;
-
- e_best = e;
- }
+ kernel_accel = kernel_accel_try;
}
}
- // final balance
-
- u32 kernel_accel_best = kernel_accel;
- u32 kernel_loops_best = kernel_loops;
+ // there's a chance that we have a fixed kernel_loops but not a fixed kernel_accel
+ // in such a case the loop above would not make any change
+ // we'll use the runtime to find out if we're allowed to do a last improvement
- u32 exec_best = -1;
-
- if ((kernel_accel_min < kernel_accel_max) || (kernel_loops_min < kernel_loops_max))
+ if (exec_ms_final > 0)
{
- const double exec_ms = try_run (device_param, kernel_accel_best, kernel_loops_best, 1);
-
- exec_best = exec_ms;
- }
-
- // reset
-
- if (kernel_accel_min < kernel_accel_max)
- {
- u32 kernel_accel_try = kernel_accel;
- u32 kernel_loops_try = kernel_loops;
-
- for (int i = 0; i < 2; i++)
+ if ((exec_ms_final * 2) <= target_ms)
{
- kernel_accel_try >>= 1;
- kernel_loops_try <<= 1;
+ const double exec_left = target_ms / exec_ms_final;
- if (kernel_accel_try < kernel_accel_min) break;
- if (kernel_loops_try > kernel_loops_max) break;
+ const double accel_left = kernel_accel_max / kernel_accel;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
+ const int exec_accel_min = MIN (exec_left, accel_left); // we want that to be int
- if (exec_ms < exec_best)
+ if (exec_accel_min >= 2)
{
- kernel_accel_best = kernel_accel_try;
- kernel_loops_best = kernel_loops_try;
-
- exec_best = exec_ms;
+ kernel_accel *= exec_accel_min;
}
}
}
- // reset
+ // sometimes we're in a bad situation where the algorithm is so slow that we cannot
+ // create enough kernel_accel to do both: keep the gpu busy and stay below target_ms.
+ // however, we need to have a minimum kernel_accel and kernel_loops of 32.
+ // luckily, at this level of workload, it becomes a linear function
- if (kernel_loops_min < kernel_loops_max)
+ if (kernel_accel < 32 || kernel_loops < 32)
{
- u32 kernel_accel_try = kernel_accel;
- u32 kernel_loops_try = kernel_loops;
+ const u32 kernel_power = kernel_accel * kernel_loops;
- for (int i = 0; i < 2; i++)
- {
- kernel_accel_try <<= 1;
- kernel_loops_try >>= 1;
+ // find sqrt
- if (kernel_accel_try > kernel_accel_max) break;
- if (kernel_loops_try < kernel_loops_min) break;
+ u32 sqrtv;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
-
- if (exec_ms < exec_best)
- {
- kernel_accel_best = kernel_accel_try;
- kernel_loops_best = kernel_loops_try;
-
- exec_best = exec_ms;
- }
+ for (sqrtv = 1; sqrtv < 0x100000; sqrtv++)
+ {
+ if ((sqrtv * sqrtv) >= kernel_power) break;
}
- }
-
- // because of the balance we may have some free space left!
-
- const int exec_left = target_ms / exec_best;
-
- const int accel_left = kernel_accel_max / kernel_accel_best;
- const int exec_accel_min = MIN (exec_left, accel_left);
+ const u32 kernel_accel_try = sqrtv;
+ const u32 kernel_loops_try = sqrtv;
- if (exec_accel_min)
- {
- kernel_accel_best *= exec_accel_min;
+ if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min))
+ {
+ kernel_accel = kernel_accel_try;
+ kernel_loops = kernel_loops_try;
+ }
}
// reset timer
// store
- kernel_accel = kernel_accel_best;
- kernel_loops = kernel_loops_best;
-
device_param->kernel_accel = kernel_accel;
device_param->kernel_loops = kernel_loops;
log_info ("Device #%u: autotuned kernel-accel to %u\n"
"Device #%u: autotuned kernel-loops to %u\n",
- device_param->device_id + 1,
- kernel_accel,
- device_param->device_id + 1,
- kernel_loops);
+ device_param->device_id + 1, kernel_accel,
+ device_param->device_id + 1, kernel_loops);
fprintf (stdout, "%s", PROMPT);
+
fflush (stdout);
}
hc_clEnqueueCopyBuffer (data.ocl, device_param->command_queue, device_param->d_combs, device_param->d_combs_c, 0, 0, innerloop_left * sizeof (comb_t), 0, NULL, NULL);
}
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
if (data.benchmark == 1)
{
- double exec_ms_avg_prev = get_avg_exec_time (device_param, EXEC_CACHE);
-
- // a few caching rounds
-
- for (u32 i = 0; i < 2; i++)
- {
- hc_timer_set (&device_param->timer_speed);
-
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
- double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE);
-
- exec_ms_avg_prev = exec_ms_avg;
- }
-
- // benchmark_repeats became a maximum possible repeats
-
- for (u32 i = 2; i < data.benchmark_repeats; i++)
- {
- hc_timer_set (&device_param->timer_speed);
-
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
- double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE);
-
- if ((exec_ms_avg_prev / exec_ms_avg) < 1.001) break;
-
- exec_ms_avg_prev = exec_ms_avg;
- }
+ hc_timer_set (&device_param->timer_speed);
}
+ choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
+
if (data.devices_status == STATUS_STOP_AT_CHECKPOINT) check_checkpoint ();
if (data.devices_status == STATUS_CRACKED) break;
uint version = VERSION;
uint quiet = QUIET;
uint benchmark = BENCHMARK;
- uint benchmark_repeats = BENCHMARK_REPEATS;
uint show = SHOW;
uint left = LEFT;
uint username = USERNAME;
#define IDX_FORCE 0xff08
#define IDX_RUNTIME 0xff09
#define IDX_BENCHMARK 'b'
- #define IDX_BENCHMARK_REPEATS 0xff78
#define IDX_HASH_MODE 'm'
#define IDX_ATTACK_MODE 'a'
#define IDX_RP_FILE 'r'
{"outfile-check-dir", required_argument, 0, IDX_OUTFILE_CHECK_DIR},
{"force", no_argument, 0, IDX_FORCE},
{"benchmark", no_argument, 0, IDX_BENCHMARK},
- {"benchmark-repeats", required_argument, 0, IDX_BENCHMARK_REPEATS},
{"restore", no_argument, 0, IDX_RESTORE},
{"restore-disable", no_argument, 0, IDX_RESTORE_DISABLE},
{"status", no_argument, 0, IDX_STATUS},
case IDX_LIMIT: limit = atoll (optarg); break;
case IDX_KEYSPACE: keyspace = 1; break;
case IDX_BENCHMARK: benchmark = 1; break;
- case IDX_BENCHMARK_REPEATS: benchmark_repeats = atoi (optarg); break;
case IDX_RESTORE: break;
case IDX_RESTORE_DISABLE: restore_disable = 1; break;
case IDX_STATUS: status = 1; break;
data.rp_gen_seed = rp_gen_seed;
data.force = force;
data.benchmark = benchmark;
- data.benchmark_repeats = benchmark_repeats;
data.skip = skip;
data.limit = limit;
#if defined(HAVE_HWMON) && defined(HAVE_ADL)
logfile_top_uint (attack_mode);
logfile_top_uint (attack_kern);
logfile_top_uint (benchmark);
- logfile_top_uint (benchmark_repeats);
logfile_top_uint (bitmap_min);
logfile_top_uint (bitmap_max);
logfile_top_uint (debug_mode);
attack_exec = ATTACK_EXEC_INSIDE_KERNEL;
opts_type = OPTS_TYPE_PT_GENERATE_BE
| OPTS_TYPE_PT_UNICODE
- | OPTS_TYPE_PT_ADD80;
+ | OPTS_TYPE_PT_ADD80;
kern_type = KERN_TYPE_PSTOKEN;
dgst_size = DGST_SIZE_4_5;
parse_func = pstoken_parse_hash;
break;
case 11300: if (pw_max > 40) pw_max = 40;
break;
+ case 11600: if (pw_max > 32) pw_max = 32;
+ break;
case 12500: if (pw_max > 20) pw_max = 20;
break;
case 12800: if (pw_max > 24) pw_max = 24;
device_param->device_maxclock_frequency = device_maxclock_frequency;
- // skipped
+ // device_endian_little
+
+ cl_bool device_endian_little;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_ENDIAN_LITTLE, sizeof (device_endian_little), &device_endian_little, NULL);
+
+ if (device_endian_little == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: not little endian device", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_available
+
+ cl_bool device_available;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_AVAILABLE, sizeof (device_available), &device_available, NULL);
+
+ if (device_available == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: device not available", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_compiler_available
- const u32 skipped1 = ((devices_filter & (1 << device_id)) == 0);
- const u32 skipped2 = ((device_types_filter & (device_type)) == 0);
+ cl_bool device_compiler_available;
- device_param->skipped = (skipped1 || skipped2);
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_COMPILER_AVAILABLE, sizeof (device_compiler_available), &device_compiler_available, NULL);
+
+ if (device_compiler_available == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: device no compiler available", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_execution_capabilities
+
+ cl_device_exec_capabilities device_execution_capabilities;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (device_execution_capabilities), &device_execution_capabilities, NULL);
+
+ if ((device_execution_capabilities & CL_EXEC_KERNEL) == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support executing kernels", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_extensions
+
+ size_t device_extensions_size;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXTENSIONS, 0, NULL, &device_extensions_size);
+
+ char *device_extensions = mymalloc (device_extensions_size + 1);
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXTENSIONS, device_extensions_size, device_extensions, NULL);
+
+ if (strstr (device_extensions, "base_atomics") == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support base atomics", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ if (strstr (device_extensions, "byte_addressable_store") == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support byte addressable store", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ myfree (device_extensions);
+
+ // device_local_mem_size
+
+ cl_ulong device_local_mem_size;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (device_local_mem_size), &device_local_mem_size, NULL);
+
+ if (device_local_mem_size < 32768)
+ {
+ log_info ("Device #%u: WARNING: device local mem size is too small", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+
+ // skipped
+
+ device_param->skipped |= ((devices_filter & (1 << device_id)) == 0);
+ device_param->skipped |= ((device_types_filter & (device_type)) == 0);
// driver_version
+
hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DRIVER_VERSION, 0, NULL, ¶m_value_size);
char *driver_version = (char *) mymalloc (param_value_size);
log_info ("ATTENTION! OpenCL support for CPU of catalyst driver is not reliable.");
log_info ("You are STRONGLY encouraged not to use it");
log_info ("You can use --force to override this but do not post error reports if you do so");
- log_info ("A good alternative is the free pocl, but make sure to use a version >= 3.8");
+ log_info ("A good alternative is the free pocl >= v0.13, but make sure to use a LLVM >= v3.8");
log_info ("");
return (-1);