#define MARKOV_DISABLE 0
#define MARKOV_CLASSIC 0
#define BENCHMARK 0
-#define BENCHMARK_REPEATS 100
#define RESTORE 0
#define RESTORE_TIMER 60
#define RESTORE_DISABLE 0
#define KERNEL_RULES 1024
#define KERNEL_COMBS 1024
#define KERNEL_BFS 1024
-#define KERNEL_THREADS 64
+#define KERNEL_THREADS_MAX 256
+#define KERNEL_THREADS_MAX_CPU 16
#define POWERTUNE_ENABLE 0
#define LOGFILE_DISABLE 0
#define SCRYPT_TMTO 0
#define MAX_DICTSTAT 10000
-#define NUM_DEFAULT_BENCHMARK_ALGORITHMS 135
+#define NUM_DEFAULT_BENCHMARK_ALGORITHMS 136
#define global_free(attr) \
{ \
8700,
9100,
133,
+ 13500,
11600,
12500,
13000,
" 8700 = Lotus Notes/Domino 6",
" 9100 = Lotus Notes/Domino 8",
" 133 = PeopleSoft",
+ " 13500 = PeopleSoft Token",
"",
"[[ Archives ]]",
"",
for (int i = 0; i < SPEED_CACHE; i++)
{
- double rec_ms;
-
- hc_timer_get (device_param->speed_rec[i], rec_ms);
-
- if (rec_ms > SPEED_MAXAGE) continue;
-
speed_cnt += device_param->speed_cnt[i];
speed_ms += device_param->speed_ms[i];
}
if (device_param->skipped) continue;
- // we need to clear values (set to 0) because in case the device does
- // not get new candidates it idles around but speed display would
- // show it as working.
- // if we instantly set it to 0 after reading it happens that the
- // speed can be shown as zero if the users refreshes too fast.
- // therefore, we add a timestamp when a stat was recorded and if its
- // too old we will not use it
-
speed_cnt[device_id] = 0;
speed_ms[device_id] = 0;
for (int i = 0; i < SPEED_CACHE; i++)
{
- double rec_ms;
-
- hc_timer_get (device_param->speed_rec[i], rec_ms);
-
- if (rec_ms > SPEED_MAXAGE) continue;
-
speed_cnt[device_id] += device_param->speed_cnt[i];
speed_ms[device_id] += device_param->speed_ms[i];
}
hc_clEnqueueReadBuffer (data.ocl, device_param->command_queue, device_param->d_result, CL_TRUE, 0, device_param->size_results, device_param->result, 0, NULL, NULL);
- for (uint i = 0; i < KERNEL_THREADS; i++) if (device_param->result[i] == 1) found = 1;
+ for (uint i = 0; i < device_param->kernel_threads; i++) if (device_param->result[i] == 1) found = 1;
if (found == 1)
{
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
+ if (kern_run == KERN_RUN_2)
+ {
+ if (data.opti_type & OPTI_TYPE_SLOW_HASH_SIMD)
+ {
+ num_elements = CEIL ((float) num_elements / device_param->vector_width);
+ }
+ }
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
+ while (num_elements % kernel_threads) num_elements++;
+
const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
// causes problems with special threads like in bcrypt
// const uint kernel_threads = device_param->kernel_threads;
- uint kernel_threads = KERNEL_THREADS;
+ uint kernel_threads = device_param->kernel_threads;
while (num_elements % kernel_threads) num_elements++;
}
size_t workgroup_size = 0;
+
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
- const size_t global_work_size[3] = { num_elements, 1, 1 };
+ const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
cl_kernel kernel = device_param->kernel_tm;
size_t workgroup_size = 0;
+
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
- const size_t global_work_size[3] = { num_elements, 1, 1 };
+ const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
// causes problems with special threads like in bcrypt
// const uint kernel_threads = device_param->kernel_threads;
- uint kernel_threads = KERNEL_THREADS;
+ uint kernel_threads = device_param->kernel_threads;
while (num_elements % kernel_threads) num_elements++;
hc_clSetKernelArg (data.ocl, kernel, 6, sizeof (cl_uint), device_param->kernel_params_amp[6]);
size_t workgroup_size = 0;
+
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
+
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
- const size_t global_work_size[3] = { num_elements, 1, 1 };
+ const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
hc_clFinish (data.ocl, device_param->command_queue);
}
-static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const uint size)
+static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const size_t size)
{
int rc = -1;
if (rc != 0)
{
// NOTE: clEnqueueFillBuffer () always fails with -59
- // IOW, it's not supported by Nvidia ForceWare <= 352.21, also pocl segfaults, also on apple
+ // IOW, it's not supported by Nvidia drivers <= 352.21, also pocl segfaults, also on apple
// How's that possible, OpenCL 1.2 support is advertised??
// We need to workaround...
char *tmp = (char *) mymalloc (FILLSZ);
- for (uint i = 0; i < size; i += FILLSZ)
+ for (size_t i = 0; i < size; i += FILLSZ)
{
- const int left = size - i;
+ const size_t left = size - i;
- const int fillsz = MIN (FILLSZ, left);
+ const size_t fillsz = MIN (FILLSZ, left);
hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, buf, CL_TRUE, i, fillsz, tmp, 0, NULL, NULL);
}
if (data.devices_status == STATUS_CRACKED) break;
if (data.devices_status == STATUS_ABORTED) break;
if (data.devices_status == STATUS_QUIT) break;
+
+ /**
+ * speed
+ */
+
+ const float iter_part = (float) (loop_pos + loop_left) / iter;
+
+ const u64 perf_sum_all = pws_cnt * iter_part;
+
+ double speed_ms;
+
+ hc_timer_get (device_param->timer_speed, speed_ms);
+
+ const u32 speed_pos = device_param->speed_pos;
+
+ device_param->speed_cnt[speed_pos] = perf_sum_all;
+
+ device_param->speed_ms[speed_pos] = speed_ms;
+
+ if (data.benchmark == 1)
+ {
+ if (speed_ms > 4096) data.devices_status = STATUS_ABORTED;
+ }
}
if (opts_type & OPTS_TYPE_HOOK23)
}
}
-static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const int repeat)
+static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
{
const u32 kernel_power = device_param->device_processors * device_param->kernel_threads * kernel_accel;
- device_param->kernel_params_buf32[26] = kernel_loops;
- device_param->kernel_params_buf32[27] = kernel_loops;
+ device_param->kernel_params_buf32[25] = 0;
+ device_param->kernel_params_buf32[26] = kernel_loops; // not a bug, both need to be set
+ device_param->kernel_params_buf32[27] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
// init some fake words
- for (u32 i = 0; i < kernel_power; i++)
+ if (data.hash_mode == 10700)
{
- device_param->pws_buf[i].i[0] = i;
- device_param->pws_buf[i].i[1] = 0x01234567;
- device_param->pws_buf[i].pw_len = 4 + (i & 3);
- }
+ // hash mode 10700 hangs on length 0 (infinite loop)
+
+ for (u32 i = 0; i < kernel_power; i++)
+ {
+ device_param->pws_buf[i].i[0] = i;
+ device_param->pws_buf[i].i[1] = i + 0x01234567;
+ device_param->pws_buf[i].i[2] = i + 0x89abcdef;
+ device_param->pws_buf[i].i[3] = 0xffffffff;
+ device_param->pws_buf[i].pw_len = 4 + (i & 3);
+ }
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
- if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
- {
- run_kernel_amp (device_param, kernel_power);
+ if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+ {
+ run_kernel_amp (device_param, kernel_power);
+ }
}
- // caching run
-
if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
{
- run_kernel (KERN_RUN_1, device_param, kernel_power, false);
+ run_kernel (KERN_RUN_1, device_param, kernel_power, true);
}
else
{
- run_kernel (KERN_RUN_2, device_param, kernel_power, false);
+ run_kernel (KERN_RUN_2, device_param, kernel_power, true);
}
- // now user repeats
-
- for (int i = 0; i < repeat; i++)
- {
- if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
- {
- run_kernel (KERN_RUN_1, device_param, kernel_power, true);
- }
- else
- {
- run_kernel (KERN_RUN_2, device_param, kernel_power, true);
- }
- }
-
- const double exec_ms_prev = get_avg_exec_time (device_param, repeat);
+ const double exec_ms_prev = get_avg_exec_time (device_param, 1);
// reset fake words
- memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t));
+ if (data.hash_mode == 10700)
+ {
+ memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t));
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
- hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+ }
return exec_ms_prev;
}
u32 kernel_accel = kernel_accel_min;
u32 kernel_loops = kernel_loops_min;
- // steps
-
#define STEPS_CNT 10
- #define STEPS_ACCEL_CNT (STEPS_CNT + 2)
- #define STEPS_LOOPS_CNT (STEPS_CNT + 2)
-
- u32 steps_accel[STEPS_ACCEL_CNT];
- u32 steps_loops[STEPS_LOOPS_CNT];
-
- for (int i = 0; i < STEPS_ACCEL_CNT; i++)
- {
- steps_accel[i] = 1 << i;
- }
+ #define MAX_RETRIES 1
- for (int i = 0; i < STEPS_LOOPS_CNT; i++)
+ if ((kernel_loops_min == kernel_loops_max) || (kernel_accel_min == kernel_accel_max))
{
- steps_loops[i] = 1 << i;
+ // we do this in case the user specified a fixed -u and -n on the commandline
+ // so we have a cached kernel for benchmark
+
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
}
- steps_accel[STEPS_CNT + 0] = kernel_accel_min;
- steps_accel[STEPS_CNT + 1] = kernel_accel_max;
-
- steps_loops[STEPS_CNT + 0] = kernel_loops_min;
- steps_loops[STEPS_CNT + 1] = kernel_loops_max;
+ double exec_ms_final = try_run (device_param, kernel_accel, kernel_loops);
- qsort (steps_accel, STEPS_ACCEL_CNT, sizeof (u32), sort_by_u32);
- qsort (steps_loops, STEPS_LOOPS_CNT, sizeof (u32), sort_by_u32);
+ // first find out highest kernel-loops that stays below target_ms
- // find out highest kernel-loops that stays below target_ms, we can use it later for multiplication as this is a linear function
+ for (kernel_loops = kernel_loops_max; kernel_loops > kernel_loops_min; kernel_loops >>= 1)
+ {
+ double exec_ms_best = try_run (device_param, kernel_accel_min, kernel_loops);
- u32 kernel_loops_tmp;
+ for (int i = 0; i < MAX_RETRIES; i++)
+ {
+ const double exec_ms_cur = try_run (device_param, kernel_accel_min, kernel_loops);
- for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp > kernel_loops_min; kernel_loops_tmp >>= 1)
- {
- const double exec_ms = try_run (device_param, kernel_accel_min, kernel_loops_tmp, 1);
+ exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
+ }
- if (exec_ms < target_ms) break;
+ if (exec_ms_best < target_ms) break;
}
- // kernel-accel
+ // now the same for kernel-accel but with the new kernel-loops from previous loop set
if (kernel_accel_min < kernel_accel_max)
{
- double e_best = 0;
-
- for (int i = 0; i < STEPS_ACCEL_CNT; i++)
+ for (int i = 0; i < STEPS_CNT; i++)
{
- const u32 kernel_accel_try = steps_accel[i];
+ const u32 kernel_accel_try = 1 << i;
if (kernel_accel_try < kernel_accel_min) continue;
if (kernel_accel_try > kernel_accel_max) break;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1);
-
- if (exec_ms > target_ms) break;
+ double exec_ms_best = try_run (device_param, kernel_accel_try, kernel_loops);
- const double e = kernel_accel_try / exec_ms;
-
- if (e > e_best)
+ for (int i = 0; i < MAX_RETRIES; i++)
{
- kernel_accel = kernel_accel_try;
+ const double exec_ms_cur = try_run (device_param, kernel_accel_try, kernel_loops);
- e_best = e;
+ exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
}
- }
- }
-
- // kernel-loops final
-
- if (kernel_loops_min < kernel_loops_max)
- {
- double e_best = 0;
- for (int i = 0; i < STEPS_LOOPS_CNT; i++)
- {
- const u32 kernel_loops_try = steps_loops[i];
-
- if (kernel_loops_try < kernel_loops_min) continue;
- if (kernel_loops_try > kernel_loops_max) break;
-
- const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1);
-
- if (exec_ms > target_ms) break;
-
- const double e = kernel_loops_try / exec_ms;
+ if (exec_ms_best > target_ms) break;
- if (e > e_best)
- {
- kernel_loops = kernel_loops_try;
+ exec_ms_final = exec_ms_best;
- e_best = e;
- }
+ kernel_accel = kernel_accel_try;
}
}
- // final balance
-
- u32 kernel_accel_best = kernel_accel;
- u32 kernel_loops_best = kernel_loops;
-
- u32 exec_best = -1;
-
- if ((kernel_accel_min < kernel_accel_max) || (kernel_loops_min < kernel_loops_max))
- {
- const double exec_ms = try_run (device_param, kernel_accel_best, kernel_loops_best, 1);
-
- exec_best = exec_ms;
- }
-
- // reset
+ // there's a chance that we have a fixed kernel_loops but not a fixed kernel_accel
+ // in such a case the above function would not create any change
+ // we'll use the runtime to find out if we're allowed to do a last improvement
- if (kernel_accel_min < kernel_accel_max)
+ if (exec_ms_final > 0)
{
- u32 kernel_accel_try = kernel_accel;
- u32 kernel_loops_try = kernel_loops;
-
- for (int i = 0; i < 2; i++)
+ if ((exec_ms_final * 2) <= target_ms)
{
- kernel_accel_try >>= 1;
- kernel_loops_try <<= 1;
+ const double exec_left = target_ms / exec_ms_final;
- if (kernel_accel_try < kernel_accel_min) break;
- if (kernel_loops_try > kernel_loops_max) break;
+ const double accel_left = kernel_accel_max / kernel_accel;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
+ const int exec_accel_min = MIN (exec_left, accel_left); // we want that to be int
- if (exec_ms < exec_best)
+ if (exec_accel_min >= 2)
{
- kernel_accel_best = kernel_accel_try;
- kernel_loops_best = kernel_loops_try;
-
- exec_best = exec_ms;
+ kernel_accel *= exec_accel_min;
}
}
}
- // reset
+ // sometimes we're in a bad situation where the algorithm is so slow that we cannot
+ // create enough kernel_accel to do both: keep the gpu busy and stay below target_ms.
+ // however, we need to have a minimum kernel_accel and kernel_loops of 32.
+ // luckily, at this level of workload, it became a linear function
- if (kernel_loops_min < kernel_loops_max)
+ if (kernel_accel < 32 || kernel_loops < 32)
{
- u32 kernel_accel_try = kernel_accel;
- u32 kernel_loops_try = kernel_loops;
+ const u32 kernel_power = kernel_accel * kernel_loops;
- for (int i = 0; i < 2; i++)
- {
- kernel_accel_try <<= 1;
- kernel_loops_try >>= 1;
+ // find sqrt
- if (kernel_accel_try > kernel_accel_max) break;
- if (kernel_loops_try < kernel_loops_min) break;
+ u32 sqrtv;
- const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
+ for (sqrtv = 1; sqrtv < 0x100000; sqrtv++)
+ {
+ if ((sqrtv * sqrtv) >= kernel_power) break;
+ }
- if (exec_ms < exec_best)
- {
- kernel_accel_best = kernel_accel_try;
- kernel_loops_best = kernel_loops_try;
+ const u32 kernel_accel_try = sqrtv;
+ const u32 kernel_loops_try = sqrtv;
- exec_best = exec_ms;
- }
+ if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min))
+ {
+ kernel_accel = kernel_accel_try;
+ kernel_loops = kernel_loops_try;
}
}
// store
- kernel_accel = kernel_accel_best;
- kernel_loops = kernel_loops_best;
-
device_param->kernel_accel = kernel_accel;
device_param->kernel_loops = kernel_loops;
log_info ("Device #%u: autotuned kernel-accel to %u\n"
"Device #%u: autotuned kernel-loops to %u\n",
- device_param->device_id + 1,
- kernel_accel,
- device_param->device_id + 1,
- kernel_loops);
+ device_param->device_id + 1, kernel_accel,
+ device_param->device_id + 1, kernel_loops);
fprintf (stdout, "%s", PROMPT);
+
fflush (stdout);
}
hc_clEnqueueCopyBuffer (data.ocl, device_param->command_queue, device_param->d_combs, device_param->d_combs_c, 0, 0, innerloop_left * sizeof (comb_t), 0, NULL, NULL);
}
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
if (data.benchmark == 1)
{
- double exec_ms_avg_prev = get_avg_exec_time (device_param, EXEC_CACHE);
-
- // a few caching rounds
-
- for (u32 i = 0; i < 2; i++)
- {
- hc_timer_set (&device_param->timer_speed);
-
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
- double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE);
-
- exec_ms_avg_prev = exec_ms_avg;
- }
-
- // benchmark_repeats became a maximum possible repeats
-
- for (u32 i = 2; i < data.benchmark_repeats; i++)
- {
- hc_timer_set (&device_param->timer_speed);
-
- choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
- double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE);
-
- if ((exec_ms_avg_prev / exec_ms_avg) < 1.001) break;
-
- exec_ms_avg_prev = exec_ms_avg;
- }
+ hc_timer_set (&device_param->timer_speed);
}
+ choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
+
if (data.devices_status == STATUS_STOP_AT_CHECKPOINT) check_checkpoint ();
if (data.devices_status == STATUS_CRACKED) break;
device_param->speed_ms[speed_pos] = speed_ms;
- device_param->speed_rec[speed_pos] = device_param->timer_speed;
-
hc_thread_mutex_unlock (mux_display);
speed_pos++;
speed_pos = 0;
}
- // average speed
-
- device_param->speed_cnt_total += perf_sum_all;
-
- device_param->speed_ms_total += speed_ms;
-
/**
* benchmark
*/
if (getenv ("CUDA_CACHE_DISABLE") == NULL)
putenv ((char *) "CUDA_CACHE_DISABLE=1");
+ if (getenv ("POCL_KERNEL_CACHE") == NULL)
+ putenv ((char *) "POCL_KERNEL_CACHE=0");
+
/**
* Real init
*/
uint version = VERSION;
uint quiet = QUIET;
uint benchmark = BENCHMARK;
- uint benchmark_repeats = BENCHMARK_REPEATS;
uint show = SHOW;
uint left = LEFT;
uint username = USERNAME;
#define IDX_FORCE 0xff08
#define IDX_RUNTIME 0xff09
#define IDX_BENCHMARK 'b'
- #define IDX_BENCHMARK_REPEATS 0xff78
#define IDX_HASH_MODE 'm'
#define IDX_ATTACK_MODE 'a'
#define IDX_RP_FILE 'r'
{"outfile-check-dir", required_argument, 0, IDX_OUTFILE_CHECK_DIR},
{"force", no_argument, 0, IDX_FORCE},
{"benchmark", no_argument, 0, IDX_BENCHMARK},
- {"benchmark-repeats", required_argument, 0, IDX_BENCHMARK_REPEATS},
{"restore", no_argument, 0, IDX_RESTORE},
{"restore-disable", no_argument, 0, IDX_RESTORE_DISABLE},
{"status", no_argument, 0, IDX_STATUS},
case IDX_LIMIT: limit = atoll (optarg); break;
case IDX_KEYSPACE: keyspace = 1; break;
case IDX_BENCHMARK: benchmark = 1; break;
- case IDX_BENCHMARK_REPEATS: benchmark_repeats = atoi (optarg); break;
case IDX_RESTORE: break;
case IDX_RESTORE_DISABLE: restore_disable = 1; break;
case IDX_STATUS: status = 1; break;
return (-1);
}
- if (hash_mode_chgd && hash_mode > 13400) // just added to remove compiler warnings for hash_mode_chgd
+ if (hash_mode_chgd && hash_mode > 13500) // just added to remove compiler warnings for hash_mode_chgd
{
log_error ("ERROR: Invalid hash-type specified");
if (loopback == 1)
{
- if (attack_mode == ATTACK_MODE_BF)
- {
- log_error ("ERROR: Parameter loopback not allowed in attack-mode 3");
-
- return (-1);
- }
- else if (attack_mode == ATTACK_MODE_STRAIGHT)
+ if (attack_mode == ATTACK_MODE_STRAIGHT)
{
if ((rp_files_cnt == 0) && (rp_gen == 0))
{
return (-1);
}
}
+ else
+ {
+ log_error ("ERROR: Parameter loopback allowed in attack-mode 0 only");
+
+ return (-1);
+ }
}
if (debug_mode > 0)
data.rp_gen_seed = rp_gen_seed;
data.force = force;
data.benchmark = benchmark;
- data.benchmark_repeats = benchmark_repeats;
data.skip = skip;
data.limit = limit;
#if defined(HAVE_HWMON) && defined(HAVE_ADL)
logfile_top_uint (attack_mode);
logfile_top_uint (attack_kern);
logfile_top_uint (benchmark);
- logfile_top_uint (benchmark_repeats);
logfile_top_uint (bitmap_min);
logfile_top_uint (bitmap_max);
logfile_top_uint (debug_mode);
dgst_size = DGST_SIZE_4_4;
parse_func = phpass_parse_hash;
sort_by_digest = sort_by_digest_4_4;
- opti_type = OPTI_TYPE_ZERO_BYTE;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_SLOW_HASH_SIMD;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
dgst_pos3 = 3;
break;
+ case 13500: hash_type = HASH_TYPE_SHA1;
+ salt_type = SALT_TYPE_EMBEDDED;
+ attack_exec = ATTACK_EXEC_INSIDE_KERNEL;
+ opts_type = OPTS_TYPE_PT_GENERATE_BE
+ | OPTS_TYPE_PT_UNICODE
+ | OPTS_TYPE_PT_ADD80;
+ kern_type = KERN_TYPE_PSTOKEN;
+ dgst_size = DGST_SIZE_4_5;
+ parse_func = pstoken_parse_hash;
+ sort_by_digest = sort_by_digest_4_5;
+ opti_type = OPTI_TYPE_ZERO_BYTE
+ | OPTI_TYPE_PRECOMPUTE_INIT
+ | OPTI_TYPE_EARLY_SKIP
+ | OPTI_TYPE_NOT_ITERATED
+ | OPTI_TYPE_PREPENDED_SALT
+ | OPTI_TYPE_RAW_HASH;
+ dgst_pos0 = 3;
+ dgst_pos1 = 4;
+ dgst_pos2 = 2;
+ dgst_pos3 = 1;
+ break;
+
default: usage_mini_print (PROGNAME); return (-1);
}
case 13000: esalt_size = sizeof (rar5_t); break;
case 13100: esalt_size = sizeof (krb5tgs_t); break;
case 13400: esalt_size = sizeof (keepass_t); break;
+ case 13500: esalt_size = sizeof (pstoken_t); break;
}
data.esalt_size = esalt_size;
switch (hash_mode)
{
- case 1500: hashes_buf[0].salt->salt_len = 2;
+ case 1500: hashes_buf[0].salt->salt_len = 2;
+ hashes_buf[0].salt->salt_buf[0] = 388; // pure magic
break;
case 1731: hashes_buf[0].salt->salt_len = 4;
break;
break;
case 13400: ((keepass_t *) hashes_buf[0].esalt)->version = 2;
break;
+ case 13500: ((pstoken_t *) hashes_buf[0].esalt)->salt_len = 113;
+ break;
}
}
uint digests_cnt = hashes_cnt;
uint digests_done = 0;
- uint size_digests = digests_cnt * dgst_size;
- uint size_shown = digests_cnt * sizeof (uint);
+ size_t size_digests = digests_cnt * dgst_size;
+ size_t size_shown = digests_cnt * sizeof (uint);
uint *digests_shown = (uint *) mymalloc (size_shown);
uint *digests_shown_tmp = (uint *) mymalloc (size_shown);
hc_clGetDeviceIDs (data.ocl, platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, platform_devices, &platform_devices_cnt);
+ char platform_vendor[INFOSZ] = { 0 };
+
+ hc_clGetPlatformInfo (data.ocl, platform, CL_PLATFORM_VENDOR, sizeof (platform_vendor), platform_vendor, NULL);
+
+ // find our own platform vendor because pocl and mesa are pushing original vendor_id through opencl
+ // this causes trouble with vendor id based macros
+ // we'll assign generic to those without special optimization available
+
+ cl_uint vendor_id = 0;
+
+ if (strcmp (platform_vendor, CL_VENDOR_AMD) == 0)
+ {
+ vendor_id = VENDOR_ID_AMD;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_APPLE) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_INTEL_BEIGNET) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_INTEL_SDK) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_MESA) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_NV) == 0)
+ {
+ vendor_id = VENDOR_ID_NV;
+ }
+ else if (strcmp (platform_vendor, CL_VENDOR_POCL) == 0)
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+ else
+ {
+ vendor_id = VENDOR_ID_GENERIC;
+ }
+
for (uint platform_devices_id = 0; platform_devices_id < platform_devices_cnt; platform_devices_id++)
{
size_t param_value_size = 0;
hc_device_param_t *device_param = &data.devices_param[device_id];
+ device_param->vendor_id = vendor_id;
+
device_param->device = platform_devices[platform_devices_id];
device_param->device_id = device_id;
device_param->device_type = device_type;
- // vendor_id
-
- cl_uint vendor_id = 0;
-
- hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_VENDOR_ID, sizeof (vendor_id), &vendor_id, NULL);
-
- device_param->vendor_id = vendor_id;
-
// device_name
hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_NAME, 0, NULL, ¶m_value_size);
myfree (device_opencl_version);
- if (strstr (device_version, "pocl"))
- {
- // pocl returns the real vendor_id in CL_DEVICE_VENDOR_ID which causes many problems because of hms and missing amd_bfe () etc
- // we need to overwrite vendor_id to avoid this. maybe open pocl issue?
-
- cl_uint vendor_id = VENDOR_ID_GENERIC;
-
- device_param->vendor_id = vendor_id;
- }
-
// vector_width
cl_uint vector_width;
device_param->device_processors = device_processors;
- // max_mem_alloc_size
+ // device_maxmem_alloc
+ // note we'll limit to 2gb, otherwise this causes all kinds of weird errors because of possible integer overflows in opencl runtimes
cl_ulong device_maxmem_alloc;
hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof (device_maxmem_alloc), &device_maxmem_alloc, NULL);
- device_param->device_maxmem_alloc = device_maxmem_alloc;
+ device_param->device_maxmem_alloc = MIN (device_maxmem_alloc, 0x7fffffff);
- // max_mem_alloc_size
+ // device_global_mem
cl_ulong device_global_mem;
device_param->device_global_mem = device_global_mem;
+ // max_work_group_size
+
+ size_t device_maxworkgroup_size;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (device_maxworkgroup_size), &device_maxworkgroup_size, NULL);
+
+ device_param->device_maxworkgroup_size = device_maxworkgroup_size;
+
// max_clock_frequency
cl_uint device_maxclock_frequency;
device_param->device_maxclock_frequency = device_maxclock_frequency;
- // skipped
+ // device_endian_little
+
+ cl_bool device_endian_little;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_ENDIAN_LITTLE, sizeof (device_endian_little), &device_endian_little, NULL);
+
+ if (device_endian_little == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: not little endian device", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_available
+
+ cl_bool device_available;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_AVAILABLE, sizeof (device_available), &device_available, NULL);
+
+ if (device_available == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: device not available", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_compiler_available
+
+ cl_bool device_compiler_available;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_COMPILER_AVAILABLE, sizeof (device_compiler_available), &device_compiler_available, NULL);
+
+ if (device_compiler_available == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: device no compiler available", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_execution_capabilities
+
+ cl_device_exec_capabilities device_execution_capabilities;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (device_execution_capabilities), &device_execution_capabilities, NULL);
+
+ if ((device_execution_capabilities & CL_EXEC_KERNEL) == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support executing kernels", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_extensions
+
+ size_t device_extensions_size;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXTENSIONS, 0, NULL, &device_extensions_size);
+
+ char *device_extensions = mymalloc (device_extensions_size + 1);
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXTENSIONS, device_extensions_size, device_extensions, NULL);
- const u32 skipped1 = ((devices_filter & (1 << device_id)) == 0);
- const u32 skipped2 = ((device_types_filter & (device_type)) == 0);
+ if (strstr (device_extensions, "base_atomics") == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support base atomics", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ if (strstr (device_extensions, "byte_addressable_store") == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support byte addressable store", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ myfree (device_extensions);
+
+ // device_local_mem_size
+
+ cl_ulong device_local_mem_size;
- device_param->skipped = (skipped1 || skipped2);
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (device_local_mem_size), &device_local_mem_size, NULL);
+
+ if (device_local_mem_size < 32768)
+ {
+ log_info ("Device #%u: WARNING: device local mem size is too small", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+
+ // skipped
+
+ device_param->skipped |= ((devices_filter & (1 << device_id)) == 0);
+ device_param->skipped |= ((device_types_filter & (device_type)) == 0);
// driver_version
+
hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DRIVER_VERSION, 0, NULL, ¶m_value_size);
char *driver_version = (char *) mymalloc (param_value_size);
if (device_param->skipped == 0)
{
- if (strstr (device_version, "pocl"))
- {
- if (force == 0)
- {
- log_info ("");
- log_info ("ATTENTION! All pocl drivers are known to be broken due to broken LLVM <= 3.7");
- log_info ("You are STRONGLY encouraged not to use it");
- log_info ("You can use --force to override this but do not post error reports if you do so");
- log_info ("");
-
- return (-1);
- }
- }
-
if (device_type & CL_DEVICE_TYPE_GPU)
{
- if (vendor_id == VENDOR_ID_NV)
- {
- if (device_param->kernel_exec_timeout != 0)
- {
- if (data.quiet == 0) log_info ("Device #%u: WARNING! Kernel exec timeout is not disabled, it might cause you errors of code 702", device_id + 1);
- if (data.quiet == 0) log_info (" See the wiki on how to disable it: https://hashcat.net/wiki/doku.php?id=timeout_patch");
- }
- }
- else if (vendor_id == VENDOR_ID_AMD)
+ if (vendor_id == VENDOR_ID_AMD)
{
int catalyst_check = (force == 1) ? 0 : 1;
log_info ("You can use --force to override this but do not post error reports if you do so");
log_info ("");
+ return (-1);
+ }
+ }
+ else if (vendor_id == VENDOR_ID_NV)
+ {
+ if (device_param->kernel_exec_timeout != 0)
+ {
+ if (data.quiet == 0) log_info ("Device #%u: WARNING! Kernel exec timeout is not disabled, it might cause you errors of code 702", device_id + 1);
+ if (data.quiet == 0) log_info (" See the wiki on how to disable it: https://hashcat.net/wiki/doku.php?id=timeout_patch");
+ }
+ }
+ }
+
+ if (device_type & CL_DEVICE_TYPE_CPU)
+ {
+ if (vendor_id == VENDOR_ID_AMD)
+ {
+ if (force == 0)
+ {
+ log_info ("");
+ log_info ("ATTENTION! OpenCL support for CPU of catalyst driver is not reliable.");
+ log_info ("You are STRONGLY encouraged not to use it");
+ log_info ("You can use --force to override this but do not post error reports if you do so");
+ log_info ("A good alternative is the free pocl >= v0.13, but make sure to use a LLVM >= v3.8");
+ log_info ("");
+
return (-1);
}
}
device_param->command_queue = hc_clCreateCommandQueue (data.ocl, device_param->context, device_param->device, CL_QUEUE_PROFILING_ENABLE);
+ /**
+ * kernel threads: some algorithms need a fixed kernel-threads count
+ * because of shared memory usage or bitslice
+ * there needs to be some upper limit, otherwise there's too much overhead
+ */
+
+ uint kernel_threads = MIN (KERNEL_THREADS_MAX, device_param->device_maxworkgroup_size);
+
+ if (device_param->device_type & CL_DEVICE_TYPE_CPU)
+ {
+ kernel_threads = KERNEL_THREADS_MAX_CPU;
+ }
+
+ if (hash_mode == 1500) kernel_threads = 64; // DES
+ if (hash_mode == 3000) kernel_threads = 64; // DES
+ if (hash_mode == 3200) kernel_threads = 8; // Blowfish
+ if (hash_mode == 7500) kernel_threads = 64; // RC4
+ if (hash_mode == 9000) kernel_threads = 8; // Blowfish
+ if (hash_mode == 9700) kernel_threads = 64; // RC4
+ if (hash_mode == 9710) kernel_threads = 64; // RC4
+ if (hash_mode == 9800) kernel_threads = 64; // RC4
+ if (hash_mode == 9810) kernel_threads = 64; // RC4
+ if (hash_mode == 10400) kernel_threads = 64; // RC4
+ if (hash_mode == 10410) kernel_threads = 64; // RC4
+ if (hash_mode == 10500) kernel_threads = 64; // RC4
+ if (hash_mode == 13100) kernel_threads = 64; // RC4
+
/**
* create input buffers on device : calculate size of fixed memory buffers
*/
- uint size_root_css = SP_PW_MAX * sizeof (cs_t);
- uint size_markov_css = SP_PW_MAX * CHARSIZ * sizeof (cs_t);
+ size_t size_root_css = SP_PW_MAX * sizeof (cs_t);
+ size_t size_markov_css = SP_PW_MAX * CHARSIZ * sizeof (cs_t);
device_param->size_root_css = size_root_css;
device_param->size_markov_css = size_markov_css;
- uint size_results = KERNEL_THREADS * sizeof (uint);
+ size_t size_results = kernel_threads * sizeof (uint);
device_param->size_results = size_results;
- uint size_rules = kernel_rules_cnt * sizeof (kernel_rule_t);
- uint size_rules_c = KERNEL_RULES * sizeof (kernel_rule_t);
+ size_t size_rules = kernel_rules_cnt * sizeof (kernel_rule_t);
+ size_t size_rules_c = KERNEL_RULES * sizeof (kernel_rule_t);
- uint size_plains = digests_cnt * sizeof (plain_t);
- uint size_salts = salts_cnt * sizeof (salt_t);
- uint size_esalts = salts_cnt * esalt_size;
+ size_t size_plains = digests_cnt * sizeof (plain_t);
+ size_t size_salts = salts_cnt * sizeof (salt_t);
+ size_t size_esalts = salts_cnt * esalt_size;
device_param->size_plains = size_plains;
device_param->size_digests = size_digests;
device_param->size_shown = size_shown;
device_param->size_salts = size_salts;
- uint size_combs = KERNEL_COMBS * sizeof (comb_t);
- uint size_bfs = KERNEL_BFS * sizeof (bf_t);
- uint size_tm = 32 * sizeof (bs_word_t);
+ size_t size_combs = KERNEL_COMBS * sizeof (comb_t);
+ size_t size_bfs = KERNEL_BFS * sizeof (bf_t);
+ size_t size_tm = 32 * sizeof (bs_word_t);
// scryptV stuff
- u64 size_scryptV = 1;
+ size_t size_scryptV = 1;
if ((hash_mode == 8900) || (hash_mode == 9300))
{
}
else if (device_param->vendor_id == VENDOR_ID_NV)
{
- tmto_start = 3;
+ tmto_start = 2;
}
}
else if (hash_mode == 9300)
{
if (device_param->vendor_id == VENDOR_ID_AMD)
{
- tmto_start = 3;
+ tmto_start = 2;
}
else if (device_param->vendor_id == VENDOR_ID_NV)
{
- tmto_start = 5;
+ tmto_start = 2;
}
}
}
if (quiet == 0) log_info ("");
- uint shader_per_mp = 1;
-
- if (device_param->vendor_id == VENDOR_ID_AMD)
- {
- shader_per_mp = 8;
- }
- else if (device_param->vendor_id == VENDOR_ID_NV)
- {
- shader_per_mp = 32;
- }
-
for (uint tmto = tmto_start; tmto < tmto_stop; tmto++)
{
// TODO: in theory the following calculation needs to be done per salt, not global
size_scryptV /= 1 << tmto;
- size_scryptV *= device_processors * device_processor_cores * shader_per_mp;
+ size_scryptV *= device_processors * device_processor_cores;
if (size_scryptV > device_param->device_maxmem_alloc)
{
for (uint salts_pos = 0; salts_pos < data.salts_cnt; salts_pos++)
{
data.salts_buf[salts_pos].scrypt_tmto = tmto;
- data.salts_buf[salts_pos].scrypt_phy = device_processors * device_processor_cores * shader_per_mp;
+ data.salts_buf[salts_pos].scrypt_phy = device_processors * device_processor_cores;
}
break;
if (quiet == 0) log_info ("SCRYPT tmto optimizer value set to: %u, mem: %u\n", data.salts_buf[0].scrypt_tmto, size_scryptV);
}
- /**
- * create input buffers on device : calculate size of dynamic size memory buffers
- */
-
- uint kernel_threads = KERNEL_THREADS;
-
- // some algorithms need a fixed kernel-threads count (mostly because of shared memory usage)
-
- if (hash_mode == 3200) kernel_threads = 8;
- if (hash_mode == 9000) kernel_threads = 8;
-
/**
* some algorithms need a fixed kernel-loops count
*/
// find out if we would request too much memory on memory blocks which are based on kernel_accel
- uint size_pws = 4;
- uint size_tmps = 4;
- uint size_hooks = 4;
+ size_t size_pws = 4;
+ size_t size_tmps = 4;
+ size_t size_hooks = 4;
while (kernel_accel_max >= kernel_accel_min)
{
- uint kernel_power_max = device_processors * kernel_threads * kernel_accel_max;
+ const u32 kernel_power_max = device_processors * kernel_threads * kernel_accel_max;
// size_pws
+ size_markov_css
+ size_plains
+ size_pws
+ + size_pws // not a bug
+ size_results
+ size_root_css
+ size_rules
// we don't have sm_* on vendors not NV but it doesn't matter
- snprintf (build_opts, sizeof (build_opts) - 1, "-I\"%s/\" -DVENDOR_ID=%u -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type);
+ snprintf (build_opts, sizeof (build_opts) - 1, "-cl-std=CL1.1 -I\"%s/\" -DVENDOR_ID=%u -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type);
/**
* main kernel
memset (device_param->speed_cnt, 0, SPEED_CACHE * sizeof (u64));
memset (device_param->speed_ms, 0, SPEED_CACHE * sizeof (double));
- memset (device_param->speed_rec, 0, SPEED_CACHE * sizeof (hc_timer_t));
-
- device_param->speed_cnt_total = 0;
- device_param->speed_ms_total = 0;
device_param->exec_pos = 0;