NVML_TEMPERATURE_THRESHOLD_COUNT
} nvmlTemperatureThresholds_t;
+/**
+ * Compute mode.
+ *
+ * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0.
+ * Earlier CUDA versions supported a single exclusive mode,
+ * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond.
+ */
+typedef enum nvmlComputeMode_enum
+{
+ NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device
+ NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Compute-exclusive-thread mode -- only one context per device, usable from one thread at a time
+ NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device
+ NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
+
+ // Keep this last
+ NVML_COMPUTEMODE_COUNT
+} nvmlComputeMode_t;
+
+/**
+ * GPU Operation Mode
+ *
+ * GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features.
+ *
+ * Each GOM is designed to meet specific user needs.
+ */
+typedef enum nvmlGom_enum
+{
+ NVML_GOM_ALL_ON = 0, //!< Everything is enabled and running at full speed
+
+ NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations
+ //!< are not allowed
+
+ NVML_GOM_LOW_DP = 2 //!< Designed for running graphics applications that don't require
+ //!< high bandwidth double precision
+} nvmlGpuOperationMode_t;
+
/*
* End of declarations from nvml.h
**/
typedef nvmlReturn_t (*NVML_DEVICE_GET_THRESHOLD) (nvmlDevice_t, nvmlTemperatureThresholds_t, unsigned int *);
typedef nvmlReturn_t (*NVML_DEVICE_GET_CURRPCIELINKGENERATION) (nvmlDevice_t, unsigned int *);
typedef nvmlReturn_t (*NVML_DEVICE_GET_CURRPCIELINKWIDTH) (nvmlDevice_t, unsigned int *);
+typedef nvmlReturn_t (*NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS) (nvmlDevice_t, unsigned long long *);
+typedef nvmlReturn_t (*NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS) (nvmlDevice_t, unsigned long long *);
+typedef nvmlReturn_t (*NVML_DEVICE_SET_COMPUTEMODE) (nvmlDevice_t, nvmlComputeMode_t);
+typedef nvmlReturn_t (*NVML_DEVICE_SET_OPERATIONMODE) (nvmlDevice_t, nvmlGpuOperationMode_t);
+typedef nvmlReturn_t (*NVML_DEVICE_GET_POWERMANAGEMENTLIMITCONSTRAINTS) (nvmlDevice_t, unsigned int *, unsigned int *);
+typedef nvmlReturn_t (*NVML_DEVICE_SET_POWERMANAGEMENTLIMIT) (nvmlDevice_t, unsigned int);
typedef struct
{
NVML_DEVICE_GET_THRESHOLD nvmlDeviceGetTemperatureThreshold;
NVML_DEVICE_GET_CURRPCIELINKGENERATION nvmlDeviceGetCurrPcieLinkGeneration;
NVML_DEVICE_GET_CURRPCIELINKWIDTH nvmlDeviceGetCurrPcieLinkWidth;
+ NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS nvmlDeviceGetCurrentClocksThrottleReasons;
+ NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS nvmlDeviceGetSupportedClocksThrottleReasons;
+ NVML_DEVICE_SET_COMPUTEMODE nvmlDeviceSetComputeMode;
+ NVML_DEVICE_SET_OPERATIONMODE nvmlDeviceSetGpuOperationMode;
+ NVML_DEVICE_GET_POWERMANAGEMENTLIMITCONSTRAINTS nvmlDeviceGetPowerManagementLimitConstraints;
+ NVML_DEVICE_SET_POWERMANAGEMENTLIMIT nvmlDeviceSetPowerManagementLimit;
} hm_nvml_lib_t;
nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
nvmlReturn_t hm_NVML_nvmlDeviceGetCurrPcieLinkGeneration (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *currLinkGen);
nvmlReturn_t hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *currLinkWidth);
+nvmlReturn_t hm_NVML_nvmlDeviceGetCurrentClocksThrottleReasons (NVML_PTR *nvml, nvmlDevice_t device, unsigned long long *clocksThrottleReasons);
+nvmlReturn_t hm_NVML_nvmlDeviceGetSupportedClocksThrottleReasons (NVML_PTR *nvml, nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons);
+nvmlReturn_t hm_NVML_nvmlDeviceSetComputeMode (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, nvmlComputeMode_t mode);
+nvmlReturn_t hm_NVML_nvmlDeviceSetGpuOperationMode (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, nvmlGpuOperationMode_t mode);
+nvmlReturn_t hm_NVML_nvmlDeviceGetPowerManagementLimitConstraints (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit);
+nvmlReturn_t hm_NVML_nvmlDeviceSetPowerManagementLimit (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, unsigned int limit);
#endif // HAVE_HWMON
HC_LOAD_FUNC(nvml, nvmlDeviceGetTemperatureThreshold, NVML_DEVICE_GET_THRESHOLD, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrPcieLinkGeneration, NVML_DEVICE_GET_CURRPCIELINKGENERATION, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrPcieLinkWidth, NVML_DEVICE_GET_CURRPCIELINKWIDTH, NVML, 0)
+ HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrentClocksThrottleReasons, NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS, NVML, 0)
+ HC_LOAD_FUNC(nvml, nvmlDeviceGetSupportedClocksThrottleReasons, NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS, NVML, 0)
+ HC_LOAD_FUNC(nvml, nvmlDeviceSetComputeMode, NVML_DEVICE_SET_COMPUTEMODE, NVML, 0)
+ HC_LOAD_FUNC(nvml, nvmlDeviceSetGpuOperationMode, NVML_DEVICE_SET_OPERATIONMODE, NVML, 0)
+ HC_LOAD_FUNC(nvml, nvmlDeviceGetPowerManagementLimitConstraints, NVML_DEVICE_GET_POWERMANAGEMENTLIMITCONSTRAINTS, NVML, 0)
+ HC_LOAD_FUNC(nvml, nvmlDeviceSetPowerManagementLimit, NVML_DEVICE_SET_POWERMANAGEMENTLIMIT, NVML, 0)
return 0;
}
if (nvml_rc != NVML_SUCCESS)
{
- *temp = -1;
-
- //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
- //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetTemperature()", nvml_rc, string);
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceGetTemperature()", nvml_rc, string);
}
return nvml_rc;
if (nvml_rc != NVML_SUCCESS)
{
- *speed = -1;
-
if (skip_warnings == 0)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
return nvml_rc;
}
-/* only tesla following */
-
nvmlReturn_t hm_NVML_nvmlDeviceGetPowerUsage (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *power)
{
if (!nvml) return -1;
if (nvml_rc != NVML_SUCCESS)
{
- *power = -1;
-
- //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
- //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetPowerUsage()", nvml_rc, string);
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceGetPowerUsage()", nvml_rc, string);
}
return nvml_rc;
if (nvml_rc != NVML_SUCCESS)
{
- utilization->gpu = -1;
- utilization->memory = -1;
-
- //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
- //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
}
return nvml_rc;
if (nvml_rc != NVML_SUCCESS)
{
- *clock = -1;
-
- //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
- //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceGetClockInfo()", nvml_rc, string);
}
return nvml_rc;
if (nvml_rc != NVML_SUCCESS)
{
- *temp = -1;
-
- //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
- //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceGetTemperatureThreshold()", nvml_rc, string);
}
return nvml_rc;
if (nvml_rc != NVML_SUCCESS)
{
- *currLinkGen = -1;
-
- //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
- //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceGetCurrPcieLinkGeneration()", nvml_rc, string);
}
return nvml_rc;
if (nvml_rc != NVML_SUCCESS)
{
- *currLinkWidth = -1;
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+   log_info ("WARN: %s %d %s\n", "nvmlDeviceGetCurrPcieLinkWidth()", nvml_rc, string);
+ }
+
+ return nvml_rc;
+}
+
+nvmlReturn_t hm_NVML_nvmlDeviceGetCurrentClocksThrottleReasons (NVML_PTR *nvml, nvmlDevice_t device, unsigned long long *clocksThrottleReasons)
+{
+ if (!nvml) return -1;
+
+ nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetCurrentClocksThrottleReasons (device, clocksThrottleReasons);
+
+ if (nvml_rc != NVML_SUCCESS)
+ {
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+    log_info ("WARN: %s %d %s\n", "nvmlDeviceGetCurrentClocksThrottleReasons()", nvml_rc, string);
+ }
+
+ return nvml_rc;
+}
+
+nvmlReturn_t hm_NVML_nvmlDeviceGetSupportedClocksThrottleReasons (NVML_PTR *nvml, nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons)
+{
+ if (!nvml) return -1;
+
+ nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetSupportedClocksThrottleReasons (device, supportedClocksThrottleReasons);
+
+ if (nvml_rc != NVML_SUCCESS)
+ {
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceGetSupportedClocksThrottleReasons()", nvml_rc, string);
+ }
+
+ return nvml_rc;
+}
+
+nvmlReturn_t hm_NVML_nvmlDeviceSetComputeMode (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, nvmlComputeMode_t mode)
+{
+ if (!nvml) return -1;
+
+ nvmlReturn_t nvml_rc = nvml->nvmlDeviceSetComputeMode (device, mode);
+
+ if (nvml_rc != NVML_SUCCESS)
+ {
+ if (skip_warnings == 0)
+ {
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceSetComputeMode()", nvml_rc, string);
+ }
+ }
+
+ return nvml_rc;
+}
+
+nvmlReturn_t hm_NVML_nvmlDeviceSetGpuOperationMode (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, nvmlGpuOperationMode_t mode)
+{
+ if (!nvml) return -1;
+
+ nvmlReturn_t nvml_rc = nvml->nvmlDeviceSetGpuOperationMode (device, mode);
+
+ if (nvml_rc != NVML_SUCCESS)
+ {
+ if (skip_warnings == 0)
+ {
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceSetGpuOperationMode()", nvml_rc, string);
+ }
+ }
+
+ return nvml_rc;
+}
+
+nvmlReturn_t hm_NVML_nvmlDeviceGetPowerManagementLimitConstraints (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit)
+{
+ if (!nvml) return -1;
+
+ nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetPowerManagementLimitConstraints (device, minLimit, maxLimit);
+
+ if (nvml_rc != NVML_SUCCESS)
+ {
+ if (skip_warnings == 0)
+ {
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceGetPowerManagementLimitConstraints()", nvml_rc, string);
+ }
+ }
- //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+ return nvml_rc;
+}
- //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
+nvmlReturn_t hm_NVML_nvmlDeviceSetPowerManagementLimit (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, unsigned int limit)
+{
+ if (!nvml) return -1;
+
+ nvmlReturn_t nvml_rc = nvml->nvmlDeviceSetPowerManagementLimit (device, limit);
+
+ if (nvml_rc != NVML_SUCCESS)
+ {
+ if (skip_warnings == 0)
+ {
+ const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+ log_info ("WARN: %s %d %s\n", "nvmlDeviceSetPowerManagementLimit()", nvml_rc, string);
+ }
}
return nvml_rc;
const int num_corespeed = hm_get_corespeed_with_device_id (device_id);
const int num_memoryspeed = hm_get_memoryspeed_with_device_id (device_id);
const int num_buslanes = hm_get_buslanes_with_device_id (device_id);
- // not working
- //const int num_throttle = hm_get_throttle_with_device_id (device_id);
+ const int num_throttle = hm_get_throttle_with_device_id (device_id);
char output_buf[256] = { 0 };
output_len = strlen (output_buf);
}
- /*
- if (num_throttle >= 0)
+ if (num_throttle == 1)
{
- snprintf (output_buf + output_len, sizeof (output_buf) - output_len, " Throttle:%u", num_throttle);
+ snprintf (output_buf + output_len, sizeof (output_buf) - output_len, " *Throttled*");
output_len = strlen (output_buf);
}
- */
if (output_len == 0)
{
unsigned int speed;
if (hm_NVML_nvmlDeviceGetFanSpeed (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, &speed) != NVML_ERROR_NOT_SUPPORTED) hm_adapters_nv[i].fan_get_supported = 1;
+
+ hm_NVML_nvmlDeviceSetComputeMode (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, NVML_COMPUTEMODE_EXCLUSIVE_PROCESS);
+
+ hm_NVML_nvmlDeviceSetGpuOperationMode (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, NVML_GOM_ALL_ON);
+
+ unsigned int minLimit;
+ unsigned int maxLimit;
+
+ if (hm_NVML_nvmlDeviceGetPowerManagementLimitConstraints (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, &minLimit, &maxLimit) == NVML_SUCCESS)
+ {
+ if (maxLimit > 0)
+ {
+ hm_NVML_nvmlDeviceSetPowerManagementLimit (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, maxLimit);
+ }
+ }
}
}
}
if (gpu_temp_disable == 0)
{
const int gpu_temp_threshold_slowdown = hm_get_threshold_slowdown_with_device_id (device_id);
+ const int gpu_temp_threshold_shutdown = hm_get_threshold_shutdown_with_device_id (device_id);
+
+ data.hm_device[device_id].gpu_temp_threshold_slowdown = (gpu_temp_threshold_slowdown > 0) ? gpu_temp_threshold_slowdown : 10000;
+ data.hm_device[device_id].gpu_temp_threshold_shutdown = (gpu_temp_threshold_shutdown > 0) ? gpu_temp_threshold_shutdown : 10000;
- data.hm_device[device_id].gpu_temp_threshold_slowdown = (gpu_temp_threshold_slowdown == -1) ? 100000 : gpu_temp_threshold_slowdown;
+ // we could use those numbers for gpu_temp_retain and gpu_temp_abort, too
}
/**