#define MARKOV_DISABLE 0
#define MARKOV_CLASSIC 0
#define BENCHMARK 0
-#define BENCHMARK_REPEATS 100
#define RESTORE 0
#define RESTORE_TIMER 60
#define RESTORE_DISABLE 0
#define KERNEL_RULES 1024
#define KERNEL_COMBS 1024
#define KERNEL_BFS 1024
-#define KERNEL_THREADS 64
+#define KERNEL_THREADS_MAX 256
+#define KERNEL_THREADS_MAX_CPU 16
#define POWERTUNE_ENABLE 0
#define LOGFILE_DISABLE 0
#define SCRYPT_TMTO 0
#define MAX_DICTSTAT 10000
-#define NUM_DEFAULT_BENCHMARK_ALGORITHMS 135
+#define NUM_DEFAULT_BENCHMARK_ALGORITHMS 136
#define global_free(attr) \
{ \
8700,
9100,
133,
+ 13500,
11600,
12500,
13000,
" 8700 = Lotus Notes/Domino 6",
" 9100 = Lotus Notes/Domino 8",
" 133 = PeopleSoft",
+ " 13500 = PeopleSoft Token",
"",
"[[ Archives ]]",
"",
for (int i = 0; i < SPEED_CACHE; i++)
{
- double rec_ms;
-
- hc_timer_get (device_param->speed_rec[i], rec_ms);
-
- if (rec_ms > SPEED_MAXAGE) continue;
-
speed_cnt += device_param->speed_cnt[i];
speed_ms += device_param->speed_ms[i];
}
if (device_param->skipped) continue;
- // we need to clear values (set to 0) because in case the device does
- // not get new candidates it idles around but speed display would
- // show it as working.
- // if we instantly set it to 0 after reading it happens that the
- // speed can be shown as zero if the users refreshes too fast.
- // therefore, we add a timestamp when a stat was recorded and if its
- // too old we will not use it
-
speed_cnt[device_id] = 0;
speed_ms[device_id] = 0;
for (int i = 0; i < SPEED_CACHE; i++)
{
- double rec_ms;
-
- hc_timer_get (device_param->speed_rec[i], rec_ms);
-
- if (rec_ms > SPEED_MAXAGE) continue;
-
speed_cnt[device_id] += device_param->speed_cnt[i];
speed_ms[device_id] += device_param->speed_ms[i];
}
hc_clEnqueueReadBuffer (data.ocl, device_param->command_queue, device_param->d_result, CL_TRUE, 0, device_param->size_results, device_param->result, 0, NULL, NULL);
- for (uint i = 0; i < KERNEL_THREADS; i++) if (device_param->result[i] == 1) found = 1;
+ for (uint i = 0; i < device_param->kernel_threads; i++) if (device_param->result[i] == 1) found = 1;
if (found == 1)
{
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
+ if (kern_run == KERN_RUN_2)
+ {
+ if (data.opti_type & OPTI_TYPE_SLOW_HASH_SIMD)
+ {
+ num_elements = CEIL ((float) num_elements / device_param->vector_width);
+ }
+ }
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
+ while (num_elements % kernel_threads) num_elements++;
+
const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
// causes problems with special threads like in bcrypt
// const uint kernel_threads = device_param->kernel_threads;
- uint kernel_threads = KERNEL_THREADS;
+ uint kernel_threads = device_param->kernel_threads;
while (num_elements % kernel_threads) num_elements++;
}
size_t workgroup_size = 0;
+
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
- const size_t global_work_size[3] = { num_elements, 1, 1 };
+ const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
cl_kernel kernel = device_param->kernel_tm;
size_t workgroup_size = 0;
+
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
- const size_t global_work_size[3] = { num_elements, 1, 1 };
+ const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
// causes problems with special threads like in bcrypt
// const uint kernel_threads = device_param->kernel_threads;
- uint kernel_threads = KERNEL_THREADS;
+ uint kernel_threads = device_param->kernel_threads;
while (num_elements % kernel_threads) num_elements++;
hc_clSetKernelArg (data.ocl, kernel, 6, sizeof (cl_uint), device_param->kernel_params_amp[6]);
size_t workgroup_size = 0;
+
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
- const size_t global_work_size[3] = { num_elements, 1, 1 };
+ const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
hc_clFinish (data.ocl, device_param->command_queue);
}
-static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const uint size)
+static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const size_t size)
{
int rc = -1;
if (rc != 0)
{
// NOTE: clEnqueueFillBuffer () always fails with -59
- // IOW, it's not supported by Nvidia ForceWare <= 352.21, also pocl segfaults, also on apple
+ // IOW, it's not supported by Nvidia drivers <= 352.21, also pocl segfaults, also on apple
// How's that possible, OpenCL 1.2 support is advertised??
// We need to workaround...
char *tmp = (char *) mymalloc (FILLSZ);
- for (uint i = 0; i < size; i += FILLSZ)
+ for (size_t i = 0; i < size; i += FILLSZ)
{
- const int left = size - i;
+ const size_t left = size - i;
- const int fillsz = MIN (FILLSZ, left);
+ const size_t fillsz = MIN (FILLSZ, left);
hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, buf, CL_TRUE, i, fillsz, tmp, 0, NULL, NULL);
}
if (data.devices_status == STATUS_CRACKED) break;
if (data.devices_status == STATUS_ABORTED) break;
if (data.devices_status == STATUS_QUIT) break;
+
+ /**
+ * speed
+ */
+
+ const float iter_part = (float) (loop_pos + loop_left) / iter;
+
+ const u64 perf_sum_all = pws_cnt * iter_part;
+
+ double speed_ms;
+
+ hc_timer_get (device_param->timer_speed, speed_ms);
+
+ const u32 speed_pos = device_param->speed_pos;
+
+ device_param->speed_cnt[speed_pos] = perf_sum_all;
+
+ device_param->speed_ms[speed_pos] = speed_ms;
+
+ if (data.benchmark == 1)
+ {
+ if (speed_ms > 4096) data.devices_status = STATUS_ABORTED;
+ }
}
if (opts_type & OPTS_TYPE_HOOK23)
}
}
-static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const int repeat)
+static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
{
const u32 kernel_power = device_param->device_processors * device_param->kernel_threads * kernel_accel;
- device_param->kernel_params_buf32[26] = kernel_loops;
- device_param->kernel_params_buf32[27] = kernel_loops;
+ device_param->kernel_params_buf32[25] = 0;
+ device_param->kernel_params_buf32[26] = kernel_loops; // not a bug, both need to be set
+ device_param->kernel_params_buf32[27] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
// init some fake words
- for (u32 i = 0; i < kernel_power; i++)
+ if (data.hash_mode == 10700)
{
- device_param->pws_buf[i].i[0] = i;
- device_param->pws_buf[i].i[1] = 0x01234567;
- device_param->pws_buf[i].pw_len = 4 + (i & 3);
- }
+ // hash mode 10700 hangs on length 0 (infinite loop)
+
+ for (u32 i = 0; i < kernel_power; i++)
+ {
+ device_param->pws_buf[i].i[0] = i;
+ device_param->pws_buf[i].i[1] = i + 0x01234567;
+ device_param->pws_buf[i].i[2] = i + 0x89abcdef;
+ device_param->pws_buf[i].i[3] = 0xffffffff;
+ device_param->pws_buf[i].pw_len = 4 + (i & 3);
+ }
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
- if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
- {
- run_kernel_amp (device_param, kernel_power);
+ if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+ {
+ run_kernel_amp (device_param, kernel_power);
+ }
}
- // caching run
-
if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
{
- run_kernel (KERN_RUN_1, device_param, kernel_power, false);
+ run_kernel (KERN_RUN_1, device_param, kernel_power, true);
}
else
{
- run_kernel (KERN_RUN_2, device_param, kernel_power, false);
+ run_kernel (KERN_RUN_2, device_param, kernel_power, true);
}
- // now user repeats
-
- for (int i = 0; i < repeat; i++)
- {
- if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
- {
- run_kernel (KERN_RUN_1, device_param, kernel_power, true);
- }
- else
- {
- run_kernel (KERN_RUN_2, device_param, kernel_power, true);
- }
- }
-
- const double exec_ms_prev = get_avg_exec_time (device_param, repeat);
+ const double exec_ms_prev = get_avg_exec_time (device_param, 1);
// reset fake words
- memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t));
+ if (data.hash_mode == 10700)
+ {
+ memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t));
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ }
return exec_ms_prev;
}
u32 kernel_accel = kernel_accel_min;
u32 kernel_loops = kernel_loops_min;
- // steps
-
#define STEPS_CNT 10
- #define STEPS_ACCEL_CNT (STEPS_CNT + 2)
- #define STEPS_LOOPS_CNT (STEPS_CNT + 2)
-
- u32 steps_accel[STEPS_ACCEL_CNT];
- u32 steps_loops[STEPS_LOOPS_CNT];
-
- for (int i = 0; i < STEPS_ACCEL_CNT; i++)
- {
- steps_accel[i] = 1 << i;
- }
+ #define MAX_RETRIES 1
- for (int i = 0; i < STEPS_LOOPS_CNT; i++)
+ if ((kernel_loops_min == kernel_loops_max) || (kernel_accel_min == kernel_accel_max))
{
- steps_loops[i] = 1 << i;
+ // we do this in case the user specified a fixed -u and -n on the commandline
+ // so we have a cached kernel for benchmark
+
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
}
- steps_accel[STEPS_CNT + 0] = kernel_accel_min;
- steps_accel[STEPS_CNT + 1] = kernel_accel_max;
-
- steps_loops[STEPS_CNT + 0] = kernel_loops_min;
- steps_loops[STEPS_CNT + 1] = kernel_loops_max;
+ double exec_ms_final = try_run (device_param, kernel_accel, kernel_loops);
- qsort (steps_accel, STEPS_ACCEL_CNT, sizeof (u32), sort_by_u32);
- qsort (steps_loops, STEPS_LOOPS_CNT, sizeof (u32), sort_by_u32);
+ // first find out highest kernel-loops that stays below target_ms
- // find out highest kernel-loops that stays below target_ms, we can use it later for multiplication as this is a linear function
+ for (kernel_loops = kernel_loops_max; kernel_loops > kernel_loops_min; kernel_loops >>= 1)
+ {
+ double exec_ms_best = try_run (device_param, kernel_accel_min, kernel_loops);
- u32 kernel_loops_tmp;
+ for (int i = 0; i < MAX_RETRIES; i++)
+ {
+ const double exec_ms_cur = try_run (device_param, kernel_accel_min, kernel_loops);
- for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp > kernel_loops_min; kernel_loops_tmp >>= 1)
- {
- const double exec_ms = try_run (device_param, kernel_accel_min, kernel_loops_tmp, 1);
+ exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
+ }
- if (exec_ms < target_ms) break;
+ if (exec_ms_best < target_ms) break;
}
- // kernel-accel
+ // now the same for kernel-accel but with the new kernel-loops from previous loop set
if (kernel_accel_min < kernel_accel_max)
{
- double e_best = 0;
-
- for (int i = 0; i < STEPS_ACCEL_CNT; i++)
+ for (int i = 0; i < STEPS_CNT; i++)
{
- const u32 kernel_accel_try = steps_accel[i];
+ const u32 kernel_accel_try = 1 << i;
if (kernel_accel_try < kernel_accel_min) continue;
if (kernel_accel_try > kernel_accel_max) break;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1);
-
- if (exec_ms > target_ms) break;
+ double exec_ms_best = try_run (device_param, kernel_accel_try, kernel_loops);
- const double e = kernel_accel_try / exec_ms;
-
- if (e > e_best)
+ for (int i = 0; i < MAX_RETRIES; i++)
{
- kernel_accel = kernel_accel_try;
+ const double exec_ms_cur = try_run (device_param, kernel_accel_try, kernel_loops);
- e_best = e;
+ exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
}
- }
- }
-
- // kernel-loops final
-
- if (kernel_loops_min < kernel_loops_max)
- {
- double e_best = 0;
- for (int i = 0; i < STEPS_LOOPS_CNT; i++)
- {
- const u32 kernel_loops_try = steps_loops[i];
-
- if (kernel_loops_try < kernel_loops_min) continue;
- if (kernel_loops_try > kernel_loops_max) break;
-
- const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1);
-
- if (exec_ms > target_ms) break;
-
- const double e = kernel_loops_try / exec_ms;
+ if (exec_ms_best > target_ms) break;
- if (e > e_best)
- {
- kernel_loops = kernel_loops_try;
+ exec_ms_final = exec_ms_best;
- e_best = e;
- }
+ kernel_accel = kernel_accel_try;
}
}
- // final balance
-
- u32 kernel_accel_best = kernel_accel;
- u32 kernel_loops_best = kernel_loops;
-
- u32 exec_best = -1;
-
- if ((kernel_accel_min < kernel_accel_max) || (kernel_loops_min < kernel_loops_max))
- {
- const double exec_ms = try_run (device_param, kernel_accel_best, kernel_loops_best, 1);
-
- exec_best = exec_ms;
- }
-
- // reset
+ // there's a chance that we have a fixed kernel_loops but not a fixed kernel_accel
+ // in such a case the above function would not create any change
+ // we'll use the runtime to find out if we're allowed to do a last improvement
- if (kernel_accel_min < kernel_accel_max)
+ if (exec_ms_final > 0)
{
- u32 kernel_accel_try = kernel_accel;
- u32 kernel_loops_try = kernel_loops;
-
- for (int i = 0; i < 2; i++)
+ if ((exec_ms_final * 2) <= target_ms)
{
- kernel_accel_try >>= 1;
- kernel_loops_try <<= 1;
+ const double exec_left = target_ms / exec_ms_final;
- if (kernel_accel_try < kernel_accel_min) break;
- if (kernel_loops_try > kernel_loops_max) break;
+ const double accel_left = kernel_accel_max / kernel_accel;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
+ const int exec_accel_min = MIN (exec_left, accel_left); // we want that to be int
- if (exec_ms < exec_best)
+ if (exec_accel_min >= 2)
{
- kernel_accel_best = kernel_accel_try;
- kernel_loops_best = kernel_loops_try;
-
- exec_best = exec_ms;
+ kernel_accel *= exec_accel_min;
}
}
}
- // reset
+ // sometimes we're in a bad situation where the algorithm is so slow that we cannot
+ // create enough kernel_accel to do both: keep the gpu busy and stay below target_ms.
+ // however, we need to have a minimum kernel_accel and kernel_loops of 32.
+ // luckily, at this level of workload, it became a linear function
- if (kernel_loops_min < kernel_loops_max)
+ if (kernel_accel < 32 || kernel_loops < 32)
{
- u32 kernel_accel_try = kernel_accel;
- u32 kernel_loops_try = kernel_loops;
+ const u32 kernel_power = kernel_accel * kernel_loops;
- for (int i = 0; i < 2; i++)
- {
- kernel_accel_try <<= 1;
- kernel_loops_try >>= 1;
+ // find sqrt
- if (kernel_accel_try > kernel_accel_max) break;
- if (kernel_loops_try < kernel_loops_min) break;
+ u32 sqrtv;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
+ for (sqrtv = 1; sqrtv < 0x100000; sqrtv++)
+ {
+ if ((sqrtv * sqrtv) >= kernel_power) break;
+ }
- if (exec_ms < exec_best)
- {
- kernel_accel_best = kernel_accel_try;
- kernel_loops_best = kernel_loops_try;
+ const u32 kernel_accel_try = sqrtv;
+ const u32 kernel_loops_try = sqrtv;
- exec_best = exec_ms;
- }
+ if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min))
+ {
+ kernel_accel = kernel_accel_try;
+ kernel_loops = kernel_loops_try;
}
}
// store
- kernel_accel = kernel_accel_best;
- kernel_loops = kernel_loops_best;
-
device_param->kernel_accel = kernel_accel;
device_param->kernel_loops = kernel_loops;
log_info ("Device #%u: autotuned kernel-accel to %u\n"
"Device #%u: autotuned kernel-loops to %u\n",
- device_param->device_id + 1,
- kernel_accel,
- device_param->device_id + 1,
- kernel_loops);
+ device_param->device_id + 1, kernel_accel,
+ device_param->device_id + 1, kernel_loops);
fprintf (stdout, "%s", PROMPT);
+
fflush (stdout);
}
hc_clEnqueueCopyBuffer (data.ocl, device_param->command_queue, device_param->d_combs, device_param->d_combs_c, 0, 0, innerloop_left * sizeof (comb_t), 0, NULL, NULL);
}
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
if (data.benchmark == 1)
{
- double exec_ms_avg_prev = get_avg_exec_time (device_param, EXEC_CACHE);
-
- // a few caching rounds
-
- for (u32 i = 0; i < 2; i++)
- {
- hc_timer_set (&device_param->timer_speed);
-
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
- double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE);
-
- exec_ms_avg_prev = exec_ms_avg;
- }
-
- // benchmark_repeats became a maximum possible repeats
-
- for (u32 i = 2; i < data.benchmark_repeats; i++)
- {
- hc_timer_set (&device_param->timer_speed);
-
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
- double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE);
-
- if ((exec_ms_avg_prev / exec_ms_avg) < 1.001) break;
-
- exec_ms_avg_prev = exec_ms_avg;
- }
+ hc_timer_set (&device_param->timer_speed);
}
+ choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
+
if (data.devices_status == STATUS_STOP_AT_CHECKPOINT) check_checkpoint ();
if (data.devices_status == STATUS_CRACKED) break;
device_param->speed_ms[speed_pos] = speed_ms;
- device_param->speed_rec[speed_pos] = device_param->timer_speed;
-
hc_thread_mutex_unlock (mux_display);
speed_pos++;
speed_pos = 0;
}
- // average speed
-
- device_param->speed_cnt_total += perf_sum_all;
-
- device_param->speed_ms_total += speed_ms;
-
/**
* benchmark
*/
if (getenv ("CUDA_CACHE_DISABLE") == NULL)
putenv ((char *) "CUDA_CACHE_DISABLE=1");
+ if (getenv ("POCL_KERNEL_CACHE") == NULL)
+ putenv ((char *) "POCL_KERNEL_CACHE=0");
+
/**
* Real init
*/
uint version = VERSION;
uint quiet = QUIET;
uint benchmark = BENCHMARK;
- uint benchmark_repeats = BENCHMARK_REPEATS;
uint show = SHOW;
uint left = LEFT;
uint username = USERNAME;
#define IDX_FORCE 0xff08
#define IDX_RUNTIME 0xff09
#define IDX_BENCHMARK 'b'
- #define IDX_BENCHMARK_REPEATS 0xff78
#define IDX_HASH_MODE 'm'
#define IDX_ATTACK_MODE 'a'
#define IDX_RP_FILE 'r'
{"outfile-check-dir", required_argument, 0, IDX_OUTFILE_CHECK_DIR},
{"force", no_argument, 0, IDX_FORCE},
{"benchmark", no_argument, 0, IDX_BENCHMARK},
- {"benchmark-repeats", required_argument, 0, IDX_BENCHMARK_REPEATS},
{"restore", no_argument, 0, IDX_RESTORE},
{"restore-disable", no_argument, 0, IDX_RESTORE_DISABLE},
{"status", no_argument, 0, IDX_STATUS},
case IDX_LIMIT: limit = atoll (optarg); break;
case IDX_KEYSPACE: keyspace = 1; break;
case IDX_BENCHMARK: benchmark = 1; break;
- case IDX_BENCHMARK_REPEATS: benchmark_repeats = atoi (optarg); break;
case IDX_RESTORE: break;
case IDX_RESTORE_DISABLE: restore_disable = 1; break;
case IDX_STATUS: status = 1; break;
return (-1);
}
- if (hash_mode_chgd && hash_mode > 13400) // just added to remove compiler warnings for hash_mode_chgd
+ if (hash_mode_chgd && hash_mode > 13500) // just added to remove compiler warnings for hash_mode_chgd
{
log_error ("ERROR: Invalid hash-type specified");
if (loopback == 1)
{
- if (attack_mode == ATTACK_MODE_BF)
- {
- log_error ("ERROR: Parameter loopback not allowed in attack-mode 3");
-
- return (-1);
- }
- else if (attack_mode == ATTACK_MODE_STRAIGHT)
+ if (attack_mode == ATTACK_MODE_STRAIGHT)
{
if ((rp_files_cnt == 0) && (rp_gen == 0))
{
return (-1);
}
}
+ else
+ {
+ log_error ("ERROR: Parameter loopback allowed in attack-mode 0 only");
+
+ return (-1);
+ }
}
if (debug_mode > 0)
data.rp_gen_seed = rp_gen_seed;
data.force = force;
data.benchmark = benchmark;
- data.benchmark_repeats = benchmark_repeats;
data.skip = skip;
data.limit = limit;
#if defined(HAVE_HWMON) && defined(HAVE_ADL)
logfile_top_uint (attack_mode);
logfile_top_uint (attack_kern);
logfile_top_uint (benchmark);
- logfile_top_uint (benchmark_repeats);
logfile_top_uint (bitmap_min);
logfile_top_uint (bitmap_max);
logfile_top_uint (debug_mode);
dgst_size = DGST_SIZE_4_4;
parse_func = phpass_parse_hash;
sort_by_digest = sort_by_digest_4_4;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_SLOW_HASH_SIMD;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_pos3 = 3;
break;
+ case 13500: hash_type = HASH_TYPE_SHA1;
+ salt_type = SALT_TYPE_EMBEDDED;
+ attack_exec = ATTACK_EXEC_INSIDE_KERNEL;
+ opts_type = OPTS_TYPE_PT_GENERATE_BE
+ | OPTS_TYPE_PT_UNICODE
+ | OPTS_TYPE_PT_ADD80;
+ kern_type = KERN_TYPE_PSTOKEN;
+ dgst_size = DGST_SIZE_4_5;
+ parse_func = pstoken_parse_hash;
+ sort_by_digest = sort_by_digest_4_5;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_PRECOMPUTE_INIT
+ | OPTI_TYPE_EARLY_SKIP
+ | OPTI_TYPE_NOT_ITERATED
+ | OPTI_TYPE_PREPENDED_SALT
+ | OPTI_TYPE_RAW_HASH;
+ dgst_pos0 = 3;
+ dgst_pos1 = 4;
+ dgst_pos2 = 2;
+ dgst_pos3 = 1;
+ break;
+
default: usage_mini_print (PROGNAME); return (-1);
}
case 13000: esalt_size = sizeof (rar5_t); break;
case 13100: esalt_size = sizeof (krb5tgs_t); break;
case 13400: esalt_size = sizeof (keepass_t); break;
+ case 13500: esalt_size = sizeof (pstoken_t); break;
}
data.esalt_size = esalt_size;
switch (hash_mode)
{
- case 1500: hashes_buf[0].salt->salt_len = 2;
+ case 1500: hashes_buf[0].salt->salt_len = 2;
+ hashes_buf[0].salt->salt_buf[0] = 388; // pure magic
break;
case 1731: hashes_buf[0].salt->salt_len = 4;
break;
break;
case 13400: ((keepass_t *) hashes_buf[0].esalt)->version = 2;
break;
+ case 13500: ((pstoken_t *) hashes_buf[0].esalt)->salt_len = 113;
+ break;
}
}
uint digests_cnt = hashes_cnt;
uint digests_done = 0;
- uint size_digests = digests_cnt * dgst_size;
- uint size_shown = digests_cnt * sizeof (uint);
+ size_t size_digests = digests_cnt * dgst_size;
+ size_t size_shown = digests_cnt * sizeof (uint);
uint *digests_shown = (uint *) mymalloc (size_shown);
uint *digests_shown_tmp = (uint *) mymalloc (size_shown);
hc_clGetDeviceIDs (data.ocl, platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, platform_devices, &platform_devices_cnt);
+ char platform_vendor[INFOSZ] = { 0 };
+
+ hc_clGetPlatformInfo (data.ocl, platform, CL_PLATFORM_VENDOR, sizeof (platform_vendor), platform_vendor, NULL);
+
+ // find our own platform vendor because pocl and mesa are pushing original vendor_id through opencl
+ // this causes trouble with vendor id based macros
+ // we'll assign generic to those without special optimization available
+
+ cl_uint vendor_id = 0;
+
+ if (strcmp (platform_vendor, CL_VENDOR_AMD) == 0)
+ {
+ vendor_id = VENDOR_ID_AMD;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_APPLE) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_INTEL_BEIGNET) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_INTEL_SDK) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_MESA) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_NV) == 0)
+ {
+ vendor_id = VENDOR_ID_NV;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_POCL) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+
for (uint platform_devices_id = 0; platform_devices_id < platform_devices_cnt; platform_devices_id++)
{
size_t param_value_size = 0;
hc_device_param_t *device_param = &data.devices_param[device_id];
+ device_param->vendor_id = vendor_id;
+
device_param->device = platform_devices[platform_devices_id];
device_param->device_id = device_id;
device_param->device_type = device_type;
- // vendor_id
-
- cl_uint vendor_id = 0;
-
- hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_VENDOR_ID, sizeof (vendor_id), &vendor_id, NULL);
-
- device_param->vendor_id = vendor_id;
-
// device_name
hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_NAME, 0, NULL, ¶m_value_size);
myfree (device_opencl_version);
- if (strstr (device_version, "pocl"))
- {
- // pocl returns the real vendor_id in CL_DEVICE_VENDOR_ID which causes many problems because of hms and missing amd_bfe () etc
- // we need to overwrite vendor_id to avoid this. maybe open pocl issue?
-
- cl_uint vendor_id = VENDOR_ID_GENERIC;
-
- device_param->vendor_id = vendor_id;
- }
-
// vector_width
cl_uint vector_width;
device_param->device_processors = device_processors;
- // max_mem_alloc_size
+ // device_maxmem_alloc
+ // note we'll limit to 2gb, otherwise this causes all kinds of weird errors because of possible integer overflows in opencl runtimes
cl_ulong device_maxmem_alloc;
hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof (device_maxmem_alloc), &device_maxmem_alloc, NULL);
- device_param->device_maxmem_alloc = device_maxmem_alloc;
+ device_param->device_maxmem_alloc = MIN (device_maxmem_alloc, 0x7fffffff);
- // max_mem_alloc_size
+ // device_global_mem
cl_ulong device_global_mem;
device_param->device_global_mem = device_global_mem;
+ // max_work_group_size
+
+ size_t device_maxworkgroup_size;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (device_maxworkgroup_size), &device_maxworkgroup_size, NULL);
+
+ device_param->device_maxworkgroup_size = device_maxworkgroup_size;
+
// max_clock_frequency
cl_uint device_maxclock_frequency;
device_param->device_maxclock_frequency = device_maxclock_frequency;
- // skipped
+ // device_endian_little
+
+ cl_bool device_endian_little;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_ENDIAN_LITTLE, sizeof (device_endian_little), &device_endian_little, NULL);
+
+ if (device_endian_little == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: not little endian device", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_available
+
+ cl_bool device_available;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_AVAILABLE, sizeof (device_available), &device_available, NULL);
+
+ if (device_available == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: device not available", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_compiler_available
+
+ cl_bool device_compiler_available;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_COMPILER_AVAILABLE, sizeof (device_compiler_available), &device_compiler_available, NULL);
+
+ if (device_compiler_available == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: device no compiler available", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_execution_capabilities
+
+ cl_device_exec_capabilities device_execution_capabilities;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (device_execution_capabilities), &device_execution_capabilities, NULL);
+
+ if ((device_execution_capabilities & CL_EXEC_KERNEL) == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support executing kernels", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_extensions
+
+ size_t device_extensions_size;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXTENSIONS, 0, NULL, &device_extensions_size);
+
+ char *device_extensions = mymalloc (device_extensions_size + 1);
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXTENSIONS, device_extensions_size, device_extensions, NULL);
- const u32 skipped1 = ((devices_filter & (1 << device_id)) == 0);
- const u32 skipped2 = ((device_types_filter & (device_type)) == 0);
+ if (strstr (device_extensions, "base_atomics") == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support base atomics", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ if (strstr (device_extensions, "byte_addressable_store") == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support byte addressable store", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ myfree (device_extensions);
+
+ // device_local_mem_size
+
+ cl_ulong device_local_mem_size;
- device_param->skipped = (skipped1 || skipped2);
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (device_local_mem_size), &device_local_mem_size, NULL);
+
+ if (device_local_mem_size < 32768)
+ {
+ log_info ("Device #%u: WARNING: device local mem size is too small", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+
+ // skipped
+
+ device_param->skipped |= ((devices_filter & (1 << device_id)) == 0);
+ device_param->skipped |= ((device_types_filter & (device_type)) == 0);
// driver_version
+
hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DRIVER_VERSION, 0, NULL, ¶m_value_size);
char *driver_version = (char *) mymalloc (param_value_size);
if (device_param->skipped == 0)
{
- if (strstr (device_version, "pocl"))
- {
- if (force == 0)
- {
- log_info ("");
- log_info ("ATTENTION! All pocl drivers are known to be broken due to broken LLVM <= 3.7");
- log_info ("You are STRONGLY encouraged not to use it");
- log_info ("You can use --force to override this but do not post error reports if you do so");
- log_info ("");
-
- return (-1);
- }
- }
-
if (device_type & CL_DEVICE_TYPE_GPU)
{
- if (vendor_id == VENDOR_ID_NV)
- {
- if (device_param->kernel_exec_timeout != 0)
- {
- if (data.quiet == 0) log_info ("Device #%u: WARNING! Kernel exec timeout is not disabled, it might cause you errors of code 702", device_id + 1);
- if (data.quiet == 0) log_info (" See the wiki on how to disable it: https://hashcat.net/wiki/doku.php?id=timeout_patch");
- }
- }
- else if (vendor_id == VENDOR_ID_AMD)
+ if (vendor_id == VENDOR_ID_AMD)
{
int catalyst_check = (force == 1) ? 0 : 1;
log_info ("You can use --force to override this but do not post error reports if you do so");
log_info ("");
+ return (-1);
+ }
+ }
+ else if (vendor_id == VENDOR_ID_NV)
+ {
+ if (device_param->kernel_exec_timeout != 0)
+ {
+ if (data.quiet == 0) log_info ("Device #%u: WARNING! Kernel exec timeout is not disabled, it might cause you errors of code 702", device_id + 1);
+ if (data.quiet == 0) log_info (" See the wiki on how to disable it: https://hashcat.net/wiki/doku.php?id=timeout_patch");
+ }
+ }
+ }
+
+ if (device_type & CL_DEVICE_TYPE_CPU)
+ {
+ if (vendor_id == VENDOR_ID_AMD)
+ {
+ if (force == 0)
+ {
+ log_info ("");
+ log_info ("ATTENTION! OpenCL support for CPU of catalyst driver is not reliable.");
+ log_info ("You are STRONGLY encouraged not to use it");
+ log_info ("You can use --force to override this but do not post error reports if you do so");
+ log_info ("A good alternative is the free pocl >= v0.13, but make sure to use a LLVM >= v3.8");
+ log_info ("");
+
return (-1);
}
}
device_param->command_queue = hc_clCreateCommandQueue (data.ocl, device_param->context, device_param->device, CL_QUEUE_PROFILING_ENABLE);
+ /**
+ * kernel threads: some algorithms need a fixed kernel-threads count
+ * because of shared memory usage or bitslice
+ * there needs to be some upper limit, otherwise there's too much overhead
+ */
+
+ uint kernel_threads = MIN (KERNEL_THREADS_MAX, device_param->device_maxworkgroup_size);
+
+ if (device_param->device_type & CL_DEVICE_TYPE_CPU)
+ {
+ kernel_threads = KERNEL_THREADS_MAX_CPU;
+ }
+
+ if (hash_mode == 1500) kernel_threads = 64; // DES
+ if (hash_mode == 3000) kernel_threads = 64; // DES
+ if (hash_mode == 3200) kernel_threads = 8; // Blowfish
+ if (hash_mode == 7500) kernel_threads = 64; // RC4
+ if (hash_mode == 9000) kernel_threads = 8; // Blowfish
+ if (hash_mode == 9700) kernel_threads = 64; // RC4
+ if (hash_mode == 9710) kernel_threads = 64; // RC4
+ if (hash_mode == 9800) kernel_threads = 64; // RC4
+ if (hash_mode == 9810) kernel_threads = 64; // RC4
+ if (hash_mode == 10400) kernel_threads = 64; // RC4
+ if (hash_mode == 10410) kernel_threads = 64; // RC4
+ if (hash_mode == 10500) kernel_threads = 64; // RC4
+ if (hash_mode == 13100) kernel_threads = 64; // RC4
+
/**
* create input buffers on device : calculate size of fixed memory buffers
*/
- uint size_root_css = SP_PW_MAX * sizeof (cs_t);
- uint size_markov_css = SP_PW_MAX * CHARSIZ * sizeof (cs_t);
+ size_t size_root_css = SP_PW_MAX * sizeof (cs_t);
+ size_t size_markov_css = SP_PW_MAX * CHARSIZ * sizeof (cs_t);
device_param->size_root_css = size_root_css;
device_param->size_markov_css = size_markov_css;
- uint size_results = KERNEL_THREADS * sizeof (uint);
+ size_t size_results = kernel_threads * sizeof (uint);
device_param->size_results = size_results;
- uint size_rules = kernel_rules_cnt * sizeof (kernel_rule_t);
- uint size_rules_c = KERNEL_RULES * sizeof (kernel_rule_t);
+ size_t size_rules = kernel_rules_cnt * sizeof (kernel_rule_t);
+ size_t size_rules_c = KERNEL_RULES * sizeof (kernel_rule_t);
- uint size_plains = digests_cnt * sizeof (plain_t);
- uint size_salts = salts_cnt * sizeof (salt_t);
- uint size_esalts = salts_cnt * esalt_size;
+ size_t size_plains = digests_cnt * sizeof (plain_t);
+ size_t size_salts = salts_cnt * sizeof (salt_t);
+ size_t size_esalts = salts_cnt * esalt_size;
device_param->size_plains = size_plains;
device_param->size_digests = size_digests;
device_param->size_shown = size_shown;
device_param->size_salts = size_salts;
- uint size_combs = KERNEL_COMBS * sizeof (comb_t);
- uint size_bfs = KERNEL_BFS * sizeof (bf_t);
- uint size_tm = 32 * sizeof (bs_word_t);
+ size_t size_combs = KERNEL_COMBS * sizeof (comb_t);
+ size_t size_bfs = KERNEL_BFS * sizeof (bf_t);
+ size_t size_tm = 32 * sizeof (bs_word_t);
// scryptV stuff
- u64 size_scryptV = 1;
+ size_t size_scryptV = 1;
if ((hash_mode == 8900) || (hash_mode == 9300))
{
}
else if (device_param->vendor_id == VENDOR_ID_NV)
{
- tmto_start = 3;
+ tmto_start = 2;
}
}
else if (hash_mode == 9300)
{
if (device_param->vendor_id == VENDOR_ID_AMD)
{
- tmto_start = 3;
+ tmto_start = 2;
}
else if (device_param->vendor_id == VENDOR_ID_NV)
{
- tmto_start = 5;
+ tmto_start = 2;
}
}
}
if (quiet == 0) log_info ("");
- uint shader_per_mp = 1;
-
- if (device_param->vendor_id == VENDOR_ID_AMD)
- {
- shader_per_mp = 8;
- }
- else if (device_param->vendor_id == VENDOR_ID_NV)
- {
- shader_per_mp = 32;
- }
-
for (uint tmto = tmto_start; tmto < tmto_stop; tmto++)
{
// TODO: in theory the following calculation needs to be done per salt, not global
size_scryptV /= 1 << tmto;
- size_scryptV *= device_processors * device_processor_cores * shader_per_mp;
+ size_scryptV *= device_processors * device_processor_cores;
if (size_scryptV > device_param->device_maxmem_alloc)
{
for (uint salts_pos = 0; salts_pos < data.salts_cnt; salts_pos++)
{
data.salts_buf[salts_pos].scrypt_tmto = tmto;
- data.salts_buf[salts_pos].scrypt_phy = device_processors * device_processor_cores * shader_per_mp;
+ data.salts_buf[salts_pos].scrypt_phy = device_processors * device_processor_cores;
}
break;
if (quiet == 0) log_info ("SCRYPT tmto optimizer value set to: %u, mem: %u\n", data.salts_buf[0].scrypt_tmto, size_scryptV);
}
- /**
- * create input buffers on device : calculate size of dynamic size memory buffers
- */
-
- uint kernel_threads = KERNEL_THREADS;
-
- // some algorithms need a fixed kernel-threads count (mostly because of shared memory usage)
-
- if (hash_mode == 3200) kernel_threads = 8;
- if (hash_mode == 9000) kernel_threads = 8;
-
/**
* some algorithms need a fixed kernel-loops count
*/
// find out if we would request too much memory on memory blocks which are based on kernel_accel
- uint size_pws = 4;
- uint size_tmps = 4;
- uint size_hooks = 4;
+ size_t size_pws = 4;
+ size_t size_tmps = 4;
+ size_t size_hooks = 4;
while (kernel_accel_max >= kernel_accel_min)
{
- uint kernel_power_max = device_processors * kernel_threads * kernel_accel_max;
+ const u32 kernel_power_max = device_processors * kernel_threads * kernel_accel_max;
// size_pws
+ size_markov_css
+ size_plains
+ size_pws
+ + size_pws // not a bug
+ size_results
+ size_root_css
+ size_rules
// we don't have sm_* on vendors not NV but it doesn't matter
- snprintf (build_opts, sizeof (build_opts) - 1, "-I\"%s/\" -DVENDOR_ID=%u -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type);
+ snprintf (build_opts, sizeof (build_opts) - 1, "-cl-std=CL1.1 -I\"%s/\" -DVENDOR_ID=%u -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type);
/**
* main kernel
memset (device_param->speed_cnt, 0, SPEED_CACHE * sizeof (u64));
memset (device_param->speed_ms, 0, SPEED_CACHE * sizeof (double));
- memset (device_param->speed_rec, 0, SPEED_CACHE * sizeof (hc_timer_t));
-
- device_param->speed_cnt_total = 0;
- device_param->speed_ms_total = 0;
device_param->exec_pos = 0;