Send a notice to the user in case the driver's temperature threshold for slowdown is reach...

author jsteube <jens.steube@gmail.com>

Sat, 28 May 2016 22:59:24 +0000 (00:59 +0200)

committer jsteube <jens.steube@gmail.com>

Sat, 28 May 2016 22:59:24 +0000 (00:59 +0200)
author jsteube <jens.steube@gmail.com>
Sat, 28 May 2016 22:59:24 +0000 (00:59 +0200)
committer jsteube <jens.steube@gmail.com>
Sat, 28 May 2016 22:59:24 +0000 (00:59 +0200)
diff --git a/include/ext_nvml.h b/include/ext_nvml.h

index 68fd324..03671d5 100644 (file)
--- a/include/ext_nvml.h
+++ b/include/ext_nvml.h
@@ -58,6 +58,15 @@ typedef enum nvmlClockType_enum {
         NVML_CLOCK_MEM = 2
  } nvmlClockType_t;
  
+typedef enum nvmlTemperatureThresholds_enum
+{
+    NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0,    // Temperature at which the GPU will shut down
+                                                // for hardware protection
+    NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1,    // Temperature at which the GPU will begin to slow down
+    // Keep this last: its value equals the number of threshold types listed above
+    NVML_TEMPERATURE_THRESHOLD_COUNT
+} nvmlTemperatureThresholds_t;
+
  /*
   * End of declarations from nvml.h
   **/
@@ -76,6 +85,7 @@ typedef nvmlReturn_t (*NVML_DEVICE_GET_FAN_SPEED) (nvmlDevice_t, unsigned int *)
  typedef nvmlReturn_t (*NVML_DEVICE_GET_POWER_USAGE) (nvmlDevice_t, unsigned int *);
  typedef nvmlReturn_t (*NVML_DEVICE_GET_UTILIZATION_RATES) (nvmlDevice_t, nvmlUtilization_t *);
  typedef nvmlReturn_t (*NVML_DEVICE_GET_CLOCKINFO) (nvmlDevice_t, nvmlClockType_t, unsigned int *);
+typedef nvmlReturn_t (*NVML_DEVICE_GET_THRESHOLD) (nvmlDevice_t, nvmlTemperatureThresholds_t, unsigned int *);
  
  typedef struct
  {
@@ -91,6 +101,7 @@ typedef struct
    NVML_DEVICE_GET_POWER_USAGE nvmlDeviceGetPowerUsage;
    NVML_DEVICE_GET_UTILIZATION_RATES nvmlDeviceGetUtilizationRates;
    NVML_DEVICE_GET_CLOCKINFO nvmlDeviceGetClockInfo;
+  NVML_DEVICE_GET_THRESHOLD nvmlDeviceGetTemperatureThreshold;
  
  } hm_nvml_lib_t;
  
@@ -109,6 +120,7 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetFanSpeed (NVML_PTR *nvml, int, nvmlDevice_t de
  nvmlReturn_t hm_NVML_nvmlDeviceGetPowerUsage (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *power);
  nvmlReturn_t hm_NVML_nvmlDeviceGetUtilizationRates (NVML_PTR *nvml, nvmlDevice_t device, nvmlUtilization_t *utilization);
  nvmlReturn_t hm_NVML_nvmlDeviceGetClockInfo (NVML_PTR *nvml, nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
  
  #endif // HAVE_HWMON && HAVE_NVML
  
diff --git a/include/shared.h b/include/shared.h

index b1ddb29..2724587 100644 (file)
--- a/include/shared.h
+++ b/include/shared.h
@@ -1454,11 +1454,12 @@ int hm_check_fanspeed_control (void *adl, hm_attrs_t *hm_device, u32 *valid_adl_
  // void hm_get_opencl_busid_devid (hm_attrs_t *hm_device, uint opencl_num_devices, cl_device_id *devices);
  #endif // HAVE_ADL
  
-int hm_get_temperature_with_device_id (const uint device_id);
-int hm_get_fanspeed_with_device_id    (const uint device_id);
-int hm_get_utilization_with_device_id (const uint device_id);
-int hm_get_memoryspeed_with_device_id (const uint device_id);
-int hm_get_corespeed_with_device_id   (const uint device_id);
+int hm_get_threshold_slowdown_with_device_id (const uint device_id);
+int hm_get_temperature_with_device_id        (const uint device_id);
+int hm_get_fanspeed_with_device_id           (const uint device_id);
+int hm_get_utilization_with_device_id        (const uint device_id);
+int hm_get_memoryspeed_with_device_id        (const uint device_id);
+int hm_get_corespeed_with_device_id          (const uint device_id);
  
  int hm_set_fanspeed_with_device_id_amd (const uint device_id, const int fanspeed);
  
diff --git a/include/types.h b/include/types.h

index 2ccfe02..df55563 100644 (file)
--- a/include/types.h
+++ b/include/types.h
@@ -1098,8 +1098,11 @@ typedef struct
  
    } adapter_index;
  
-  int     od_version;
-  int     fan_supported;
+  int   od_version;
+  int   fan_supported;
+
+  int   gpu_temp_threshold_slowdown;
+  int   gpu_temp_threshold_shutdown;
  
    // int     busid; // used for CL_DEVICE_TOPOLOGY_AMD but broken for dual GPUs
    // int     devid; // used for CL_DEVICE_TOPOLOGY_AMD but broken for dual GPUs
diff --git a/src/ext_nvml.c b/src/ext_nvml.c

index 2a66214..31c078a 100644 (file)
--- a/src/ext_nvml.c
+++ b/src/ext_nvml.c
@@ -33,6 +33,7 @@ int nvml_init (NVML_PTR *nvml)
    HC_LOAD_FUNC(nvml, nvmlDeviceGetPowerUsage, NVML_DEVICE_GET_POWER_USAGE, NVML, 0)
    HC_LOAD_FUNC(nvml, nvmlDeviceGetUtilizationRates, NVML_DEVICE_GET_UTILIZATION_RATES, NVML, 0)
    HC_LOAD_FUNC(nvml, nvmlDeviceGetClockInfo, NVML_DEVICE_GET_CLOCKINFO, NVML, 0)
+  HC_LOAD_FUNC(nvml, nvmlDeviceGetTemperatureThreshold, NVML_DEVICE_GET_THRESHOLD, NVML, 0)
  
    return 0;
  }
@@ -218,3 +219,20 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetClockInfo (NVML_PTR *nvml, nvmlDevice_t device
    return nvml_rc;
  }
  
+nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp)
+{
+  if (!nvml) return -1; // no NVML handle: bail out early, *temp is left untouched
+
+  nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetTemperatureThreshold (device, thresholdType, temp);
+
+  if (nvml_rc != NVML_SUCCESS)
+  {
+    *temp = -1; // failure sentinel; stored as UINT_MAX because *temp is unsigned
+
+    //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+    //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetTemperatureThreshold()", nvml_rc, string);
+  }
+
+  return nvml_rc;
+}
diff --git a/src/hashcat.c b/src/hashcat.c

index fd09ac3..9a60017 100644 (file)
--- a/src/hashcat.c
+++ b/src/hashcat.c
@@ -2104,6 +2104,7 @@ static void check_hash (hc_device_param_t *device_param, plain_t *plain)
  
        out_fp = stdout;
      }
+
      lock_file (out_fp);
    }
    else
@@ -3781,6 +3782,8 @@ static void *thread_monitor (void *p)
    #ifdef HAVE_HWMON
    uint hwmon_check   = 0;
  
+  int slowdown_warnings = 0;
+
    // these variables are mainly used for fan control (AMD only)
  
    int *fan_speed_chgd = (int *) mycalloc (data.devices_cnt, sizeof (int));
@@ -3845,8 +3848,52 @@ static void *thread_monitor (void *p)
  
      if (data.devices_status != STATUS_RUNNING) continue;
  
-
      #ifdef HAVE_HWMON
+
+    if (1)
+    {
+      hc_thread_mutex_lock (mux_adl);
+
+      for (uint device_id = 0; device_id < data.devices_cnt; device_id++)
+      {
+        hc_device_param_t *device_param = &data.devices_param[device_id];
+
+        if (device_param->skipped) continue;
+
+        if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+
+        const int temperature = hm_get_temperature_with_device_id (device_id);
+
+        const int threshold = data.hm_device[device_id].gpu_temp_threshold_slowdown;
+
+        if (temperature >= threshold)
+        {
+          if (slowdown_warnings < 3)
+          {
+            if (data.quiet == 0) clear_prompt ();
+
+            log_info ("WARNING: Drivers temperature threshold (%dc) hit on GPU #%d, expect performance to drop...", threshold, device_id + 1);
+
+            if (slowdown_warnings == 2)
+            {
+              log_info ("");
+            }
+
+            if (data.quiet == 0) fprintf (stdout, "%s", PROMPT);
+            if (data.quiet == 0) fflush (stdout);
+
+            slowdown_warnings++;
+          }
+        }
+        else
+        {
+          slowdown_warnings = 0;
+        }
+      }
+
+      hc_thread_mutex_unlock (mux_adl);
+    }
+
      if (hwmon_check == 1)
      {
        hc_thread_mutex_lock (mux_adl);
@@ -13946,11 +13993,11 @@ int main (int argc, char **argv)
  
      #ifdef HAVE_HWMON
      #if defined(HAVE_NVML) || defined(HAVE_NVAPI)
-    hm_attrs_t hm_adapters_nv[DEVICES_MAX]  = { { { 0 }, 0, 0 } };
+    hm_attrs_t hm_adapters_nv[DEVICES_MAX]  = { { { 0 }, 0, 0, 0, 0 } };
      #endif
  
      #ifdef HAVE_ADL
-    hm_attrs_t hm_adapters_amd[DEVICES_MAX] = { { { 0 }, 0, 0 } };
+    hm_attrs_t hm_adapters_amd[DEVICES_MAX] = { { { 0 }, 0, 0, 0, 0 } };
      #endif
  
      if (gpu_temp_disable == 0)
@@ -14219,13 +14266,13 @@ int main (int argc, char **argv)
        }
      }
  
-   /*
-    * Temporary fix:
-    * with AMD r9 295x cards it seems that we need to set the powertune value just AFTER the ocl init stuff
-    * otherwise after hc_clCreateContext () etc, powertune value was set back to "normal" and cards unfortunately
-    * were not working @ full speed (setting hm_ADL_Overdrive_PowerControl_Set () here seems to fix the problem)
-    * Driver / ADL bug?
-    */
+    /**
+     * Temporary fix:
+     * with AMD r9 295x cards it seems that we need to set the powertune value just AFTER the ocl init stuff
+     * otherwise after hc_clCreateContext () etc, powertune value was set back to "normal" and cards unfortunately
+     * were not working @ full speed (setting hm_ADL_Overdrive_PowerControl_Set () here seems to fix the problem)
+     * Driver / ADL bug?
+     */
  
      #ifdef HAVE_ADL
      if (powertune_enable == 1)
@@ -15565,6 +15612,19 @@ int main (int argc, char **argv)
          run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css);
        }
  
+      /**
+       * Store thermal target temperature so we can send a notice to user
+       */
+
+      #if defined(HAVE_HWMON)
+      if (gpu_temp_disable == 0)
+      {
+        const int gpu_temp_threshold_slowdown = hm_get_threshold_slowdown_with_device_id (device_id);
+
+        data.hm_device[device_id].gpu_temp_threshold_slowdown = (gpu_temp_threshold_slowdown == -1) ? 100000 : gpu_temp_threshold_slowdown;
+      }
+      #endif
+
        /**
         * Store initial fanspeed if gpu_temp_retain is enabled
         */
diff --git a/src/shared.c b/src/shared.c

index 6f17e88..0396e93 100644 (file)
--- a/src/shared.c
+++ b/src/shared.c
@@ -3057,6 +3057,34 @@ int hm_get_adapter_index_amd (hm_attrs_t *hm_device, u32 *valid_adl_device_list,
  }
  #endif // HAVE_ADL
  
+int hm_get_threshold_slowdown_with_device_id (const uint device_id)
+{
+  if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1; // only GPU devices are queried
+
+  #ifdef HAVE_ADL
+  // no ADL slowdown-threshold query is implemented here
+  #endif
+
+  #if defined(HAVE_NVML) || defined(HAVE_NVAPI)
+  if (data.devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  {
+    #if defined(LINUX) && defined(HAVE_NVML)
+    int target = -1; // stays -1 ("unavailable") if the wrapper returns early without writing it
+
+    hm_NVML_nvmlDeviceGetTemperatureThreshold (data.hm_nv, data.hm_device[device_id].adapter_index.nv, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, (unsigned int *) &target);
+
+    return target; // threshold reported by NVML, or -1 when the query failed
+    #endif
+
+    #if defined(WIN) && defined(HAVE_NVAPI)
+    // no NVAPI slowdown-threshold query is implemented here
+    #endif // WIN && HAVE_NVAPI
+  }
+  #endif // HAVE_NVML || HAVE_NVAPI
+
+  return -1; // no slowdown threshold available for this device
+}
+
  int hm_get_temperature_with_device_id (const uint device_id)
  {
    if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
@@ -3169,7 +3197,6 @@ int hm_get_fanspeed_with_device_id (const uint device_id)
        #endif
  
        #if defined(WIN) && defined(HAVE_NVAPI)
-
        NV_GPU_COOLER_SETTINGS pCoolerSettings;
  
        pCoolerSettings.Version = GPU_COOLER_SETTINGS_VER | sizeof (NV_GPU_COOLER_SETTINGS);
author	jsteube <jens.steube@gmail.com>
	Sat, 28 May 2016 22:59:24 +0000 (00:59 +0200)
committer	jsteube <jens.steube@gmail.com>
	Sat, 28 May 2016 22:59:24 +0000 (00:59 +0200)
include/ext_nvml.h		patch \| blob \| history
include/shared.h		patch \| blob \| history
include/types.h		patch \| blob \| history
src/ext_nvml.c		patch \| blob \| history
src/hashcat.c		patch \| blob \| history
src/shared.c		patch \| blob \| history