device_param->speed_cnt[speed_pos] = perf_sum_all;
device_param->speed_ms[speed_pos] = speed_ms;
+
+ if (data.benchmark == 1)
+ {
+ if (speed_ms > 4096) data.devices_status = STATUS_ABORTED;
+ }
}
if (opts_type & OPTS_TYPE_HOOK23)
#define MAX_RETRIES 1
- double exec_ms_final = 0;
+ if ((kernel_loops_min == kernel_loops_max) || (kernel_accel_min == kernel_accel_max))
+ {
+ // we do this in case the user specified a fixed -u and -n on the command line,
+ // so we have a cached kernel for the benchmark
+
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ try_run (device_param, kernel_accel, kernel_loops);
+ }
+
+ double exec_ms_final = try_run (device_param, kernel_accel, kernel_loops);
// first find out highest kernel-loops that stays below target_ms
exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
}
- if (exec_ms_final == 0) exec_ms_final = exec_ms_best;
-
if (exec_ms_best < target_ms) break;
}
}
}
- // sometimes we're in a bad situation that the algorithm is so slow that we can not
- // create enough kernel_accel to do both, keep the gpu busy and stay below target_ms.
- // however, we need to have a minimum kernel_accel and kernel_loops of 32.
- // luckily, at this level of workload, it became a linear function
-
- while (kernel_accel < 32 && kernel_loops >= 32)
- {
- const u32 kernel_accel_try = kernel_accel * 2;
- const u32 kernel_loops_try = kernel_loops / 2;
-
- if (kernel_accel_try > kernel_accel_max) break;
- if (kernel_loops_try < kernel_loops_min) break;
-
- kernel_accel = kernel_accel_try;
- kernel_loops = kernel_loops_try;
- }
-
- // finally there's a chance that we have a fixed kernel_loops but not a fixed kernel_accel
+ // there's a chance that we have a fixed kernel_loops but not a fixed kernel_accel
// in such a case the above function would not create any change
// we'll use the runtime to find out if we're allow to do last improvement
if (exec_ms_final > 0)
{
- if (exec_ms_final < target_ms)
+ if ((exec_ms_final * 2) <= target_ms)
{
const double exec_left = target_ms / exec_ms_final;
const double accel_left = kernel_accel_max / kernel_accel;
- const double exec_accel_min = MIN (exec_left, accel_left);
+ const int exec_accel_min = MIN (exec_left, accel_left); // we want that to be int
if (exec_accel_min >= 2)
{
}
}
+ // sometimes we're in a bad situation where the algorithm is so slow that we can not
+ // create enough kernel_accel to do both: keep the gpu busy and stay below target_ms.
+ // however, we need to have a minimum kernel_accel and kernel_loops of 32.
+ // luckily, at this level of workload, it becomes a linear function
+
+ if (kernel_accel < 32 || kernel_loops < 32)
+ {
+ const u32 kernel_power = kernel_accel * kernel_loops;
+
+ // find sqrt
+
+ u32 sqrtv;
+
+ for (sqrtv = 1; sqrtv < 0x100000; sqrtv++)
+ {
+ if ((sqrtv * sqrtv) >= kernel_power) break;
+ }
+
+ const u32 kernel_accel_try = sqrtv;
+ const u32 kernel_loops_try = sqrtv;
+
+ if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min))
+ {
+ kernel_accel = kernel_accel_try;
+ kernel_loops = kernel_loops_try;
+ }
+ }
+
// reset timer
device_param->exec_pos = 0;
break;
case 11300: if (pw_max > 40) pw_max = 40;
break;
+ case 11600: if (pw_max > 32) pw_max = 32;
+ break;
case 12500: if (pw_max > 20) pw_max = 20;
break;
case 12800: if (pw_max > 24) pw_max = 24;
device_param->device_maxclock_frequency = device_maxclock_frequency;
- // skipped
+ // device_endian_little
- const u32 skipped1 = ((devices_filter & (1 << device_id)) == 0);
- const u32 skipped2 = ((device_types_filter & (device_type)) == 0);
+ cl_bool device_endian_little;
- device_param->skipped = (skipped1 || skipped2);
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_ENDIAN_LITTLE, sizeof (device_endian_little), &device_endian_little, NULL);
+
+ if (device_endian_little == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: not little endian device", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_available
+
+ cl_bool device_available;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_AVAILABLE, sizeof (device_available), &device_available, NULL);
+
+ if (device_available == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: device not available", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_compiler_available
+
+ cl_bool device_compiler_available;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_COMPILER_AVAILABLE, sizeof (device_compiler_available), &device_compiler_available, NULL);
+
+ if (device_compiler_available == CL_FALSE)
+ {
+ log_info ("Device #%u: WARNING: device no compiler available", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_execution_capabilities
+
+ cl_device_exec_capabilities device_execution_capabilities;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (device_execution_capabilities), &device_execution_capabilities, NULL);
+
+ if ((device_execution_capabilities & CL_EXEC_KERNEL) == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support executing kernels", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ // device_extensions
+
+ size_t device_extensions_size;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXTENSIONS, 0, NULL, &device_extensions_size);
+
+ char *device_extensions = mymalloc (device_extensions_size + 1);
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_EXTENSIONS, device_extensions_size, device_extensions, NULL);
+
+ if (strstr (device_extensions, "base_atomics") == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support base atomics", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ if (strstr (device_extensions, "byte_addressable_store") == 0)
+ {
+ log_info ("Device #%u: WARNING: device does not support byte addressable store", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+ myfree (device_extensions);
+
+ // device_local_mem_size
+
+ cl_ulong device_local_mem_size;
+
+ hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (device_local_mem_size), &device_local_mem_size, NULL);
+
+ if (device_local_mem_size < 32768)
+ {
+ log_info ("Device #%u: WARNING: device local mem size is too small", device_id + 1);
+
+ device_param->skipped = 1;
+ }
+
+
+ // skipped
+
+ device_param->skipped |= ((devices_filter & (1 << device_id)) == 0);
+ device_param->skipped |= ((device_types_filter & (device_type)) == 0);
// driver_version
+
hc_clGetDeviceInfo (data.ocl, device_param->device, CL_DRIVER_VERSION, 0, NULL, ¶m_value_size);
char *driver_version = (char *) mymalloc (param_value_size);
log_info ("ATTENTION! OpenCL support for CPU of catalyst driver is not reliable.");
log_info ("You are STRONGLY encouraged not to use it");
log_info ("You can use --force to override this but do not post error reports if you do so");
- log_info ("A good alternative is the free pocl, but make sure to use a version >= 3.8");
+ log_info ("A good alternative is the free pocl >= v0.13, but make sure to use a LLVM >= v3.8");
log_info ("");
return (-1);
* some algorithms have a maximum kernel-loops count
*/
- /*
if (attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
{
if (data.salts_buf[0].salt_iter < device_param->kernel_loops_max)
device_param->kernel_loops_max = data.salts_buf[0].salt_iter;
}
}
- */
/**
* some algorithms need a special kernel-accel