NVML_CLOCK_MEM = 2
} nvmlClockType_t;
+typedef enum nvmlTemperatureThresholds_enum // Selects which temperature threshold nvmlDeviceGetTemperatureThreshold () is asked for
+{
+ NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down
+ // for hardware protection
+ NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin to slow down
+ // Keep this last: it then equals the number of threshold types listed above
+ NVML_TEMPERATURE_THRESHOLD_COUNT
+} nvmlTemperatureThresholds_t;
+
/*
* End of declarations from nvml.h
**/
typedef nvmlReturn_t (*NVML_DEVICE_GET_POWER_USAGE) (nvmlDevice_t, unsigned int *);
typedef nvmlReturn_t (*NVML_DEVICE_GET_UTILIZATION_RATES) (nvmlDevice_t, nvmlUtilization_t *);
typedef nvmlReturn_t (*NVML_DEVICE_GET_CLOCKINFO) (nvmlDevice_t, nvmlClockType_t, unsigned int *);
+typedef nvmlReturn_t (*NVML_DEVICE_GET_THRESHOLD) (nvmlDevice_t, nvmlTemperatureThresholds_t, unsigned int *); // signature of nvmlDeviceGetTemperatureThreshold: device, threshold selector, output value
typedef struct
{
NVML_DEVICE_GET_POWER_USAGE nvmlDeviceGetPowerUsage;
NVML_DEVICE_GET_UTILIZATION_RATES nvmlDeviceGetUtilizationRates;
NVML_DEVICE_GET_CLOCKINFO nvmlDeviceGetClockInfo;
+ NVML_DEVICE_GET_THRESHOLD nvmlDeviceGetTemperatureThreshold; // entry point used to query a temperature threshold of a device
} hm_nvml_lib_t;
nvmlReturn_t hm_NVML_nvmlDeviceGetPowerUsage (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *power);
nvmlReturn_t hm_NVML_nvmlDeviceGetUtilizationRates (NVML_PTR *nvml, nvmlDevice_t device, nvmlUtilization_t *utilization);
nvmlReturn_t hm_NVML_nvmlDeviceGetClockInfo (NVML_PTR *nvml, nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); // writes the requested threshold to *temp and returns the NVML return code
#endif // HAVE_HWMON && HAVE_NVML
// void hm_get_opencl_busid_devid (hm_attrs_t *hm_device, uint opencl_num_devices, cl_device_id *devices);
#endif // HAVE_ADL
-int hm_get_temperature_with_device_id (const uint device_id);
-int hm_get_fanspeed_with_device_id (const uint device_id);
-int hm_get_utilization_with_device_id (const uint device_id);
-int hm_get_memoryspeed_with_device_id (const uint device_id);
-int hm_get_corespeed_with_device_id (const uint device_id);
+int hm_get_threshold_slowdown_with_device_id (const uint device_id); // slowdown threshold of the device, or -1 if it is not a GPU or no backend can report it
+int hm_get_temperature_with_device_id (const uint device_id);
+int hm_get_fanspeed_with_device_id (const uint device_id);
+int hm_get_utilization_with_device_id (const uint device_id);
+int hm_get_memoryspeed_with_device_id (const uint device_id);
+int hm_get_corespeed_with_device_id (const uint device_id);
int hm_set_fanspeed_with_device_id_amd (const uint device_id, const int fanspeed);
} adapter_index;
- int od_version;
- int fan_supported;
+ int od_version;
+ int fan_supported;
+
+ int gpu_temp_threshold_slowdown;
+ int gpu_temp_threshold_shutdown;
// int busid; // used for CL_DEVICE_TOPOLOGY_AMD but broken for dual GPUs
// int devid; // used for CL_DEVICE_TOPOLOGY_AMD but broken for dual GPUs
HC_LOAD_FUNC(nvml, nvmlDeviceGetPowerUsage, NVML_DEVICE_GET_POWER_USAGE, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetUtilizationRates, NVML_DEVICE_GET_UTILIZATION_RATES, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetClockInfo, NVML_DEVICE_GET_CLOCKINFO, NVML, 0)
+ HC_LOAD_FUNC(nvml, nvmlDeviceGetTemperatureThreshold, NVML_DEVICE_GET_THRESHOLD, NVML, 0)
return 0;
}
return nvml_rc;
}
+/**
+ * Thin wrapper around the nvmlDeviceGetTemperatureThreshold function pointer stored in nvml
+ *
+ * nvml          - NVML handle holding the function pointer; if it is not set, -1 is returned and *temp is left untouched
+ * device        - NVML device handle, forwarded unchanged
+ * thresholdType - which threshold to query, forwarded unchanged
+ * temp          - receives the threshold; overwritten with (unsigned int) -1 if the library call fails
+ *
+ * returns the NVML return code of the library call (or -1 without a handle)
+ */
+nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp)
+{
+  if (!nvml) return -1;
+
+  const nvmlReturn_t rc = nvml->nvmlDeviceGetTemperatureThreshold (device, thresholdType, temp);
+
+  if (rc == NVML_SUCCESS) return rc;
+
+  // flag the output as invalid for callers that only look at *temp
+  *temp = (unsigned int) -1;
+
+  //const char *string = hm_NVML_nvmlErrorString (nvml, rc);
+
+  //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetTemperatureThreshold()", rc, string);
+
+  return rc;
+}
out_fp = stdout;
}
+
lock_file (out_fp);
}
else
#ifdef HAVE_HWMON
uint hwmon_check = 0;
+ int slowdown_warnings = 0;
+
// these variables are mainly used for fan control (AMD only)
int *fan_speed_chgd = (int *) mycalloc (data.devices_cnt, sizeof (int));
if (data.devices_status != STATUS_RUNNING) continue;
-
#ifdef HAVE_HWMON
+
+ if (1)
+ {
+   // Warn the user (at most 3 times per overheating episode) when a GPU
+   // reaches the slowdown threshold stored for it in data.hm_device[]
+
+   hc_thread_mutex_lock (mux_adl);
+
+   // becomes 1 as soon as any GPU is found at or above its threshold in this pass
+   int slowdown_hit = 0;
+
+   for (uint device_id = 0; device_id < data.devices_cnt; device_id++)
+   {
+     hc_device_param_t *device_param = &data.devices_param[device_id];
+
+     if (device_param->skipped) continue;
+
+     if ((device_param->device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+
+     const int threshold = data.hm_device[device_id].gpu_temp_threshold_slowdown;
+
+     // the field is zero-initialized and only filled in when temperature
+     // monitoring is enabled; without a real threshold there is nothing to compare
+     if (threshold <= 0) continue;
+
+     const int temperature = hm_get_temperature_with_device_id (device_id);
+
+     // also covers a failed reading (-1)
+     if (temperature < threshold) continue;
+
+     slowdown_hit = 1;
+
+     if (slowdown_warnings >= 3) continue;
+
+     if (data.quiet == 0) clear_prompt ();
+
+     log_info ("WARNING: Drivers temperature threshold (%dc) hit on GPU #%d, expect performance to drop...", threshold, device_id + 1);
+
+     // blank line after the last warning of the series
+     if (slowdown_warnings == 2)
+     {
+       log_info ("");
+     }
+
+     if (data.quiet == 0) fprintf (stdout, "%s", PROMPT);
+     if (data.quiet == 0) fflush (stdout);
+
+     slowdown_warnings++;
+   }
+
+   // Re-arm the warnings only once every GPU is below its threshold again.
+   // Resetting per device would let a single cool GPU clear the counter while
+   // another one is still hot, so the limit of 3 warnings would never apply.
+   if (slowdown_hit == 0) slowdown_warnings = 0;
+
+   hc_thread_mutex_unlock (mux_adl);
+ }
+
if (hwmon_check == 1)
{
hc_thread_mutex_lock (mux_adl);
#ifdef HAVE_HWMON
#if defined(HAVE_NVML) || defined(HAVE_NVAPI)
- hm_attrs_t hm_adapters_nv[DEVICES_MAX] = { { { 0 }, 0, 0 } };
+ hm_attrs_t hm_adapters_nv[DEVICES_MAX] = { { { 0 }, 0, 0, 0, 0 } };
#endif
#ifdef HAVE_ADL
- hm_attrs_t hm_adapters_amd[DEVICES_MAX] = { { { 0 }, 0, 0 } };
+ hm_attrs_t hm_adapters_amd[DEVICES_MAX] = { { { 0 }, 0, 0, 0, 0 } };
#endif
if (gpu_temp_disable == 0)
}
}
- /*
- * Temporary fix:
- * with AMD r9 295x cards it seems that we need to set the powertune value just AFTER the ocl init stuff
- * otherwise after hc_clCreateContext () etc, powertune value was set back to "normal" and cards unfortunately
- * were not working @ full speed (setting hm_ADL_Overdrive_PowerControl_Set () here seems to fix the problem)
- * Driver / ADL bug?
- */
+ /**
+ * Temporary fix:
+ * With AMD r9 295x cards it seems that the powertune value has to be set just AFTER the OpenCL init stuff.
+ * Otherwise, after hc_clCreateContext () etc., the powertune value is set back to "normal" and the cards
+ * unfortunately do not work at full speed (calling hm_ADL_Overdrive_PowerControl_Set () here seems to fix the problem).
+ * Driver / ADL bug?
+ */
#ifdef HAVE_ADL
if (powertune_enable == 1)
run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css);
}
+ /**
+ * Remember the slowdown temperature threshold of this device so the user can be warned when it is reached
+ */
+
+ #if defined(HAVE_HWMON)
+ if (gpu_temp_disable == 0)
+ {
+   int slowdown_threshold = hm_get_threshold_slowdown_with_device_id (device_id);
+
+   // -1 means the threshold is unknown: store a very large value instead
+   if (slowdown_threshold == -1) slowdown_threshold = 100000;
+
+   data.hm_device[device_id].gpu_temp_threshold_slowdown = slowdown_threshold;
+ }
+ #endif
+
+
/**
* Store initial fanspeed if gpu_temp_retain is enabled
*/
}
#endif // HAVE_ADL
+/**
+ * Query the temperature at which the driver starts to slow down the given device
+ *
+ * device_id - index into data.devices_param[] / data.hm_device[]
+ *
+ * returns the threshold reported by NVML, or -1 if the device is not a GPU,
+ * no backend is available for it, or the query did not deliver a value
+ */
+int hm_get_threshold_slowdown_with_device_id (const uint device_id)
+{
+  if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+  #ifdef HAVE_ADL
+  // no ADL query implemented: falls through to the -1 at the end
+  #endif
+
+  #if defined(HAVE_NVML) || defined(HAVE_NVAPI)
+  if (data.devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  {
+    #if defined(LINUX) && defined(HAVE_NVML)
+    // Start from the "unknown" sentinel: the wrapper returns without writing
+    // the output when no NVML handle is loaded, and a leftover 0 would be
+    // taken for a real threshold that every temperature reading exceeds
+    int target = -1;
+
+    hm_NVML_nvmlDeviceGetTemperatureThreshold (data.hm_nv, data.hm_device[device_id].adapter_index.nv, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, (unsigned int *) &target);
+
+    return target;
+    #endif
+
+    #if defined(WIN) && defined(HAVE_NVAPI)
+    // no NVAPI query implemented: falls through to the -1 at the end
+    #endif // WIN && HAVE_NVAPI
+  }
+  #endif // HAVE_NVML || HAVE_NVAPI
+
+  return -1;
+}
+
+
int hm_get_temperature_with_device_id (const uint device_id)
{
if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
#endif
#if defined(WIN) && defined(HAVE_NVAPI)
-
NV_GPU_COOLER_SETTINGS pCoolerSettings;
pCoolerSettings.Version = GPU_COOLER_SETTINGS_VER | sizeof (NV_GPU_COOLER_SETTINGS);