WIP support nvidia power management

This commit is contained in:
Nathan Dwarshuis 2023-09-28 21:45:53 -04:00
parent 564796b81d
commit b5920374a7
1 changed files with 37 additions and 21 deletions

View File

@ -3,7 +3,7 @@ local i_o = require 'i_o'
return function(update_freq, config, common, width, point) return function(update_freq, config, common, width, point)
local NA = 'N/A' local NA = 'N/A'
local NVIDIA_EXE = 'nvidia-settings' local NVIDIA_EXE = 'nvidia-smi'
local geo = config.geometry local geo = config.geometry
local sep_spacing = geo.sep_spacing local sep_spacing = geo.sep_spacing
@ -22,29 +22,17 @@ return function(update_freq, config, common, width, point)
-- vars to process the nv settings glob -- vars to process the nv settings glob
-- --
-- glob will be of the form:
-- <used_mem>
-- <total_mem>
-- <temp>
-- <gpu_freq>,<mem_freq>
-- graphics=<gpu_util>, memory=<mem_util>, video=<vid_util>, PCIe=<pci_util>
local NV_QUERY = NVIDIA_EXE..
' -t'..
' -q UsedDedicatedGPUmemory'..
' -q TotalDedicatedGPUmemory'..
' -q ThermalSensorReading'..
' -q [gpu:0]/GPUCurrentClockFreqs'..
' -q [gpu:0]/GPUutilization'..
' 2>/dev/null'
local NV_REGEX = '(%d+)\n'.. local NV_QUERY = NVIDIA_EXE..
'(%d+)\n'.. ' --query-gpu=memory.used,memory.total,temperature.gpu,clocks.gr,clocks.mem,utilization.gpu,utilization.decoder'..
'(%d+)\n'.. ' --format=csv,noheader,nounits'
'(%d+),(%d+)\n'..
'graphics=(%d+), memory=%d+, video=(%d+), PCIe=%d+\n' local NV_REGEX = '(%d+), (%d+), (%d+), (%d+), (%d+), (%d+), (%d+)'
local mod_state = { local mod_state = {
error = false, error = false,
gpu_frequency = 0,
memory_frequency = 0,
used_memory = 0, used_memory = 0,
total_memory = 0, total_memory = 0,
temp_reading = 0, temp_reading = 0,
@ -52,6 +40,11 @@ return function(update_freq, config, common, width, point)
vid_utilization = 0 vid_utilization = 0
} }
local sleep_token = 0
local sleep_limit = 10
local gpu_idle_freq_limit = 250
-- TODO ensure this file exists
local runtime_status_file = config.dev_power..'/runtime_status' local runtime_status_file = config.dev_power..'/runtime_status'
local want_nvidia_query = config.show_temp or config.show_clock local want_nvidia_query = config.show_temp or config.show_clock
@ -59,7 +52,28 @@ return function(update_freq, config, common, width, point)
local update_state = function() local update_state = function()
local is_active = i_o.read_file(runtime_status_file, nil, '*l') == 'active' local is_active = i_o.read_file(runtime_status_file, nil, '*l') == 'active'
if is_active and want_nvidia_query then -- this will make the nvidia-smi query fire only so often when the clock
-- is below a certain threshold. This is necessary to get the GPU to
-- suspend when nothing is 'using' it, at the cost of lowering the
-- response time for when it eventually is used again. Maybe won't
-- matter that much since the jobs that use the GPU tend to be long
-- anyways, so a few seconds won't hurt. Furthermore, there are ways to
-- wake this up manually by detecting certain processes that likely will
-- use the GPU (ffmpeg and friends) or detecting processes that are
-- holding /dev/nvidia* files (which isn't foolproof but it will capture
-- most events)
if is_active and
mod_state.gpu_frequency > 0 and
mod_state.gpu_frequency < gpu_idle_freq_limit then
if sleep_token < sleep_limit - 1 then
sleep_token = sleep_token + 1
else
sleep_token = 0
end
else
sleep_token = 0
end
if is_active and want_nvidia_query and sleep_token == 0 then
local nvidia_settings_glob = i_o.execute_cmd(NV_QUERY) local nvidia_settings_glob = i_o.execute_cmd(NV_QUERY)
if nvidia_settings_glob == nil then if nvidia_settings_glob == nil then
mod_state.error = 'Error' mod_state.error = 'Error'
@ -72,11 +86,13 @@ return function(update_freq, config, common, width, point)
mod_state.gpu_utilization, mod_state.gpu_utilization,
mod_state.vid_utilization mod_state.vid_utilization
= __string_match(nvidia_settings_glob, NV_REGEX) = __string_match(nvidia_settings_glob, NV_REGEX)
mod_state.gpu_frequency = tonumber(mod_state.gpu_frequency)
mod_state.error = false mod_state.error = false
end end
elseif is_active then elseif is_active then
mod_state.error = false mod_state.error = false
else else
mod_state.gpu_frequency = 0
mod_state.error = 'Off' mod_state.error = 'Off'
end end
end end