Monitoring System 0.1.0
System resource monitoring with pluggable collectors and alerting
Loading...
Searching...
No Matches
gpu_collector.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
5#pragma once
6
18#include <atomic>
19#include <chrono>
20#include <cstdint>
21#include <memory>
22#include <mutex>
23#include <string>
24#include <unordered_map>
25#include <vector>
26
29
30namespace kcenon {
31namespace monitoring {
32
/// GPU vendor identification.
enum class gpu_vendor {
    unknown,
    nvidia,  ///< NVIDIA Corporation.
    amd,     ///< Advanced Micro Devices.
    intel,   ///< Intel Corporation.
    apple,   ///< Apple (Apple Silicon GPU)
    other
};

/// Convert gpu_vendor to string representation.
///
/// @param vendor Vendor enumerator to convert.
/// @return The lowercase enumerator name ("nvidia", "amd", "intel", "apple",
///         "other"); "unknown" for gpu_vendor::unknown and for any value that
///         is not a named enumerator.
inline std::string gpu_vendor_to_string(gpu_vendor vendor) {
    switch (vendor) {
        // Every return needs its own case label: a statement placed before
        // the first label of a switch (or after a return, with no label) is
        // unreachable and the value silently falls through to `default`.
        case gpu_vendor::nvidia:
            return "nvidia";
        case gpu_vendor::amd:
            return "amd";
        case gpu_vendor::intel:
            return "intel";
        case gpu_vendor::apple:
            return "apple";
        case gpu_vendor::other:
            return "other";
        default:
            return "unknown";
    }
}
65
/// GPU type classification.
// NOTE(review): `integrated` and `virtual_gpu` were missing from this listing;
// their relative order is reconstructed from the order of the return
// statements in gpu_type_to_string — confirm against the original header.
enum class gpu_type {
    unknown,      ///< Unknown GPU type.
    discrete,     ///< Discrete GPU (dedicated graphics card)
    integrated,   ///< Integrated GPU (part of CPU/SoC)
    virtual_gpu   ///< Virtual GPU (cloud/VM)
};

/// Convert gpu_type to string representation.
///
/// @param type Type enumerator to convert.
/// @return "discrete", "integrated" or "virtual" for the matching enumerator;
///         "unknown" for gpu_type::unknown and for any value that is not a
///         named enumerator.
inline std::string gpu_type_to_string(gpu_type type) {
    switch (type) {
        // Each return must sit under its own case label; unlabeled returns
        // inside a switch body are unreachable.
        case gpu_type::discrete:
            return "discrete";
        case gpu_type::integrated:
            return "integrated";
        case gpu_type::virtual_gpu:
            return "virtual";
        default:
            return "unknown";
    }
}
92
106
113
114 // Utilization metrics
116
117 // Memory metrics
118 uint64_t memory_used_bytes{0};
119 uint64_t memory_total_bytes{0};
120
121 // Thermal metrics
123
124 // Power metrics
125 double power_watts{0.0};
126 double power_limit_watts{0.0};
127
128 // Clock metrics
129 double clock_mhz{0.0};
130 double memory_clock_mhz{0.0};
131
132 // Fan metrics
133 double fan_speed_percent{0.0};
134
135 // Availability flags
137 bool memory_available{false};
139 bool power_available{false};
140 bool clock_available{false};
141 bool fan_available{false};
142
143 std::chrono::system_clock::time_point timestamp;
144};
145
146// Forward declaration
147namespace platform {
148class metrics_provider;
149} // namespace platform
150
159 public:
162
163 // Non-copyable, non-moveable due to internal state
168
// Const query taking no arguments; only the declaration is visible here.
// Presumably reports whether any GPU was detected through provider_ — TODO confirm.
173 bool is_gpu_available() const;
174
// Non-const; returns gpu_device_info entries by value.
// Presumably one entry per detected GPU — TODO confirm against the implementation.
179 std::vector<gpu_device_info> enumerate_gpus();
180
// Non-const; returns gpu_reading entries by value.
// Presumably one reading per enumerated GPU — TODO confirm against the implementation.
185 std::vector<gpu_reading> read_all_gpu_metrics();
186
187 private:
188 std::unique_ptr<platform::metrics_provider> provider_;
189};
190
200 public:
202 ~gpu_collector() override = default;
203
204 // Non-copyable, non-moveable due to internal state
205 gpu_collector(const gpu_collector&) = delete;
209
210 // collector_plugin implementation
// Get the unique name of this plugin; always the literal "gpu".
211 auto name() const -> std::string_view override { return "gpu"; }
// Collect current metrics from this plugin (declaration only here).
212 auto collect() -> std::vector<metric> override;
// Get the collection interval for this plugin: a fixed 5 seconds,
// implicitly converted to std::chrono::milliseconds on return.
213 auto interval() const -> std::chrono::milliseconds override { return std::chrono::seconds(5); }
// Check if this plugin is available on the current system (declaration only here).
214 auto is_available() const -> bool override;
// Get supported metric types (declaration only here).
215 auto get_metric_types() const -> std::vector<std::string> override;
216
// Get plugin metadata. Built with designated initializers: the name comes
// from name(), the category is plugin_category::hardware, the version string
// is "1.0.0", the dependency list is empty, and platform support is required.
217 auto get_metadata() const -> plugin_metadata override {
218 return plugin_metadata{
219 .name = name(),
220 .description = "GPU metrics (utilization, memory, temperature, power)",
221 .category = plugin_category::hardware,
222 .version = "1.0.0",
223 .dependencies = {},
224 .requires_platform_support = true
225 };
226 }
227
// Initialize plugin with configuration; returns bool (declaration only here).
// NOTE(review): which config keys are read is not visible in this listing.
228 auto initialize(const config_map& config) -> bool override;
// Shutdown hook; the inline body is empty, so this override does nothing.
229 void shutdown() override {}
// Get plugin statistics as a stats_map (declaration only here).
230 auto get_statistics() const -> stats_map override;
231
// Returns GPU readings by value.
// Presumably a copy of last_readings_ — TODO confirm against the implementation.
236 std::vector<gpu_reading> get_last_readings() const;
237
// Const query; presumably reports whether GPU monitoring is usable on this
// system — TODO confirm against the implementation.
242 bool is_gpu_available() const;
243
244 private:
246
247 // Configuration
248 bool enabled_{true};
250 bool collect_memory_{true};
252 bool collect_power_{true};
253 bool collect_clock_{true};
254 bool collect_fan_{true};
255
256 // Statistics
257 mutable std::mutex stats_mutex_;
258 std::atomic<size_t> collection_count_{0};
259 std::atomic<size_t> collection_errors_{0};
260 std::atomic<size_t> gpus_found_{0};
261 std::vector<gpu_reading> last_readings_;
262
263 // Helper methods
264 metric create_metric(const std::string& name, double value, const gpu_reading& reading,
265 const std::string& unit = "") const;
266 void add_gpu_metrics(std::vector<metric>& metrics, const gpu_reading& reading);
267};
268
269} // namespace monitoring
270} // namespace kcenon
Pure virtual interface for metric collector plugins.
GPU metrics monitoring collector implementing collector_plugin interface.
auto interval() const -> std::chrono::milliseconds override
Get the collection interval for this plugin.
auto get_metadata() const -> plugin_metadata override
Get plugin metadata.
gpu_collector(gpu_collector &&)=delete
auto is_available() const -> bool override
Check if this plugin is available on the current system.
std::atomic< size_t > collection_count_
gpu_collector(const gpu_collector &)=delete
std::atomic< size_t > collection_errors_
auto collect() -> std::vector< metric > override
Collect current metrics from this plugin.
void shutdown() override
Shutdown plugin and release resources.
gpu_collector & operator=(gpu_collector &&)=delete
auto get_metric_types() const -> std::vector< std::string > override
Get supported metric types.
auto get_statistics() const -> stats_map override
Get plugin statistics.
std::atomic< size_t > gpus_found_
std::vector< gpu_reading > get_last_readings() const
auto initialize(const config_map &config) -> bool override
Initialize plugin with configuration.
void add_gpu_metrics(std::vector< metric > &metrics, const gpu_reading &reading)
metric create_metric(const std::string &name, double value, const gpu_reading &reading, const std::string &unit="") const
gpu_collector & operator=(const gpu_collector &)=delete
~gpu_collector() override=default
std::vector< gpu_reading > last_readings_
std::unique_ptr< gpu_info_collector > collector_
auto name() const -> std::string_view override
Get the unique name of this plugin.
GPU data collector using platform abstraction layer.
gpu_info_collector & operator=(gpu_info_collector &&)=delete
std::vector< gpu_reading > read_all_gpu_metrics()
std::unique_ptr< platform::metrics_provider > provider_
gpu_info_collector(const gpu_info_collector &)=delete
gpu_info_collector(gpu_info_collector &&)=delete
std::vector< gpu_device_info > enumerate_gpus()
gpu_info_collector & operator=(const gpu_info_collector &)=delete
Plugin interface for metric collectors.
Adapter for metric types to support interface definitions.
std::unordered_map< std::string, double > stats_map
Type alias for statistics map.
std::string gpu_vendor_to_string(gpu_vendor vendor)
Convert gpu_vendor to string representation.
gpu_type
GPU type classification.
@ virtual_gpu
Virtual GPU (cloud/VM)
@ discrete
Discrete GPU (dedicated graphics card)
@ unknown
Unknown GPU type.
@ integrated
Integrated GPU (part of CPU/SoC)
@ hardware
Hardware sensors (GPU, temperature, battery, power)
std::string gpu_type_to_string(gpu_type type)
Convert gpu_type to string representation.
std::unordered_map< std::string, std::string > config_map
Type alias for configuration map.
gpu_vendor
GPU vendor identification.
@ apple
Apple (Apple Silicon GPU)
@ intel
Intel Corporation.
@ nvidia
NVIDIA Corporation.
@ amd
Advanced Micro Devices.
@ platform
Platform/system power domain.
Information about a GPU device.
std::string id
Unique device identifier (e.g., "gpu0")
uint32_t device_index
Device index for multi-GPU systems.
std::string driver_version
Driver version string.
std::string device_path
Platform-specific path (e.g., /sys/class/drm/card0)
gpu_type type
GPU type (discrete/integrated/virtual)
std::string name
Human-readable device name.
A single GPU metrics reading.
uint64_t memory_used_bytes
VRAM currently used.
bool fan_available
Whether fan metrics available.
double power_watts
Current power consumption.
double fan_speed_percent
Fan speed (0-100)
bool memory_available
Whether memory metrics available.
gpu_device_info device
GPU device information.
bool temperature_available
Whether temperature metrics available.
bool power_available
Whether power metrics available.
double temperature_celsius
GPU temperature.
bool utilization_available
Whether utilization metrics available.
uint64_t memory_total_bytes
Total VRAM capacity.
double utilization_percent
GPU compute utilization (0-100)
double clock_mhz
Current GPU clock speed.
bool clock_available
Whether clock metrics available.
std::chrono::system_clock::time_point timestamp
Reading timestamp.
double memory_clock_mhz
Current memory clock speed.
double power_limit_watts
Power limit/TDP.
Basic metric structure for interface compatibility.
Metadata describing a collector plugin.