Monitoring System 0.1.0
System resource monitoring with pluggable collectors and alerting
Loading...
Searching...
No Matches
adaptive_monitor.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
15#pragma once
16
17#include <string>
18#include <memory>
19#include <chrono>
20#include <vector>
21#include <unordered_map>
22#include <mutex>
23#include <atomic>
24#include <algorithm>
25#include <functional>
26#include <thread>
27#include <cmath>
28#include <random>
29
34
35namespace kcenon { namespace monitoring {
36
41 conservative, // Prefer system stability over monitoring detail
42 balanced, // Balance between monitoring and performance
43 aggressive // Prefer monitoring detail over system resources
44};
45
/**
 * @brief System load levels, ordered from least to most loaded.
 *
 * The declaration order is significant: the adaptation code compares levels
 * with relational operators and steps to the next level with
 * `static_cast<int>(level) + 1`, so enumerators must stay in ascending
 * order of load.
 *
 * The percentages below are the default thresholds of adaptive_config
 * (idle/low/moderate/high = 20/40/60/80); they are configurable, and a level
 * is entered when the effective load is greater than or equal to its lower
 * bound.
 */
enum class load_level {
    idle,      // effective load below idle_threshold      (default: < 20%)
    low,       // [idle_threshold, low_threshold)          (default: 20-40%)
    moderate,  // [low_threshold, moderate_threshold)      (default: 40-60%)
    high,      // [moderate_threshold, high_threshold)     (default: 60-80%)
    critical   // at or above high_threshold               (default: >= 80%)
};
56
61 // Thresholds for load levels (CPU percentage)
62 double idle_threshold{20.0};
63 double low_threshold{40.0};
64 double moderate_threshold{60.0};
65 double high_threshold{80.0};
66
67 // Memory thresholds (percentage)
70
71 // Collection intervals by load level (milliseconds)
72 std::chrono::milliseconds idle_interval{100};
73 std::chrono::milliseconds low_interval{250};
74 std::chrono::milliseconds moderate_interval{500};
75 std::chrono::milliseconds high_interval{1000};
76 std::chrono::milliseconds critical_interval{5000};
77
78 // Sampling rates by load level (0.0 to 1.0)
79 double idle_sampling_rate{1.0};
80 double low_sampling_rate{0.8};
82 double high_sampling_rate{0.2};
84
85 // Adaptation parameters
87 std::chrono::seconds adaptation_interval{10};
88 double smoothing_factor{0.7}; // Exponential smoothing for load average
89
90 // Threshold tuning parameters (ARC-005)
91 double hysteresis_margin{5.0}; // Percentage margin to prevent oscillation
92 std::chrono::milliseconds cooldown_period{1000}; // Minimum time between level changes
93 bool enable_hysteresis{true}; // Enable/disable hysteresis
94 bool enable_cooldown{true}; // Enable/disable cooldown
95
99 std::chrono::milliseconds get_interval_for_load(load_level level) const {
100 switch (level) {
101 case load_level::idle: return idle_interval;
102 case load_level::low: return low_interval;
104 case load_level::high: return high_interval;
106 }
107 return moderate_interval;
108 }
109
114 switch (level) {
120 }
122 }
123};
124
129 std::uint64_t total_adaptations{0};
130 std::uint64_t upscale_count{0};
131 std::uint64_t downscale_count{0};
132 std::uint64_t samples_dropped{0};
133 std::uint64_t samples_collected{0};
134 double average_cpu_usage{0.0};
137 std::chrono::milliseconds current_interval;
139 std::chrono::system_clock::time_point last_adaptation;
140
141 // Threshold tuning statistics (ARC-005)
142 std::uint64_t hysteresis_prevented_changes{0}; // Changes prevented by hysteresis
143 std::uint64_t cooldown_prevented_changes{0}; // Changes prevented by cooldown
144 std::chrono::system_clock::time_point last_level_change; // Time of last level change
145};
146
157private:
158 std::shared_ptr<kcenon::monitoring::metrics_collector> collector_;
160 mutable std::mutex config_mutex_; // Protects config_
162 std::atomic<bool> enabled_{true};
163 std::atomic<double> current_sampling_rate_{1.0};
164 mutable std::mutex stats_mutex_;
165
166public:
168 std::shared_ptr<kcenon::monitoring::metrics_collector> collector,
169 const adaptive_config& config = {}
170 ) : collector_(collector), config_(config) {
172 stats_.last_adaptation = std::chrono::system_clock::now();
173 }
174
178 common::Result<kcenon::monitoring::metrics_snapshot> collect() {
179 if (!should_sample()) {
181 return common::Result<kcenon::monitoring::metrics_snapshot>::err(error_info(kcenon::monitoring::monitoring_error_code::operation_cancelled, "Sample dropped due to adaptive sampling").to_common_error());
182 }
183
185 return collector_->collect();
186 }
187
192 void adapt(const kcenon::monitoring::system_metrics& sys_metrics) {
193 // Copy config under lock to avoid holding both locks
194 adaptive_config local_config;
195 {
196 std::lock_guard<std::mutex> config_lock(config_mutex_);
197 local_config = config_;
198 }
199
200 std::lock_guard<std::mutex> lock(stats_mutex_);
201
202 bool is_first_adaptation = (stats_.total_adaptations == 0);
203
204 // Initialize averages on first adaptation
205 if (is_first_adaptation) {
208 } else {
209 // Update average metrics using exponential smoothing
211 local_config.smoothing_factor * sys_metrics.cpu_usage_percent +
212 (1.0 - local_config.smoothing_factor) * stats_.average_cpu_usage;
213
215 local_config.smoothing_factor * sys_metrics.memory_usage_percent +
216 (1.0 - local_config.smoothing_factor) * stats_.average_memory_usage;
217 }
218
219 // Determine load level with hysteresis support
224 local_config
225 );
226
227 // Adapt if load level changed
228 if (new_level != stats_.current_load_level) {
229 auto now = std::chrono::system_clock::now();
230
231 // Check cooldown period (ARC-005)
232 // Skip cooldown check for first adaptation to allow initial level setting
233 if (local_config.enable_cooldown && !is_first_adaptation) {
234 auto time_since_last_change = std::chrono::duration_cast<std::chrono::milliseconds>(
236 );
237 if (time_since_last_change < local_config.cooldown_period) {
239 return; // Skip this adaptation due to cooldown
240 }
241 }
242
243 if (new_level > stats_.current_load_level) {
245 } else {
247 }
248
249 stats_.current_load_level = new_level;
250 stats_.current_interval = local_config.get_interval_for_load(new_level);
251 current_sampling_rate_ = local_config.get_sampling_rate_for_load(new_level);
256 }
257 }
258
263 std::lock_guard lock(stats_mutex_);
264 return stats_;
265 }
266
270 std::chrono::milliseconds get_current_interval() const {
271 std::lock_guard lock(stats_mutex_);
273 }
274
279 void set_config(const adaptive_config& config) {
280 std::lock_guard<std::mutex> lock(config_mutex_);
281 config_ = config;
282 }
283
289 std::lock_guard<std::mutex> lock(config_mutex_);
290 return config_;
291 }
292
296 void set_enabled(bool enabled) {
297 enabled_ = enabled;
298 }
299
303 bool is_enabled() const {
304 return enabled_;
305 }
306
307private:
311 bool should_sample() const {
312 if (!enabled_) return true;
313
314 // Use random sampling based on current rate
315 static thread_local std::mt19937 gen(std::random_device{}());
316 std::uniform_real_distribution<> dis(0.0, 1.0);
317 return dis(gen) < current_sampling_rate_.load();
318 }
319
325 double cpu_usage,
326 double memory_usage,
327 const adaptive_config& cfg
328 ) {
329 // Consider memory pressure in load calculation
330 double effective_load = cpu_usage;
331
332 // Memory pressure should escalate load level
333 if (memory_usage > cfg.memory_critical_threshold) {
334 // Critical memory -> at least high load
335 effective_load = std::max(effective_load, cfg.high_threshold + 1.0);
336 } else if (memory_usage > cfg.memory_warning_threshold) {
337 // Warning memory -> at least moderate load
338 effective_load = std::max(effective_load, cfg.moderate_threshold + 1.0);
339 }
340
341 // Apply strategy-specific adjustments BEFORE determining level
342 switch (cfg.strategy) {
344 effective_load *= 0.8; // Be more conservative
345 break;
347 effective_load *= 1.2; // Be more aggressive
348 break;
350 default:
351 break;
352 }
353
354 // Determine load level
355 if (effective_load >= cfg.high_threshold) {
357 } else if (effective_load >= cfg.moderate_threshold) {
358 return load_level::high;
359 } else if (effective_load >= cfg.low_threshold) {
361 } else if (effective_load >= cfg.idle_threshold) {
362 return load_level::low;
363 } else {
364 return load_level::idle;
365 }
366 }
367
381 double cpu_usage,
382 double memory_usage,
383 load_level current_level,
384 const adaptive_config& cfg
385 ) {
386 // First calculate what the level would be without hysteresis
387 load_level raw_level = calculate_load_level_with_config(cpu_usage, memory_usage, cfg);
388
389 // If hysteresis is disabled, return raw level
390 if (!cfg.enable_hysteresis) {
391 return raw_level;
392 }
393
394 // If no change, return current level
395 if (raw_level == current_level) {
396 return current_level;
397 }
398
399 // Calculate effective load (same logic as calculate_load_level_with_config)
400 double effective_load = cpu_usage;
401 if (memory_usage > cfg.memory_critical_threshold) {
402 effective_load = std::max(effective_load, cfg.high_threshold + 1.0);
403 } else if (memory_usage > cfg.memory_warning_threshold) {
404 effective_load = std::max(effective_load, cfg.moderate_threshold + 1.0);
405 }
406
407 switch (cfg.strategy) {
409 effective_load *= 0.8;
410 break;
412 effective_load *= 1.2;
413 break;
415 default:
416 break;
417 }
418
419 // Get threshold for current level
420 double current_threshold = get_threshold_for_level(current_level, cfg);
421 double margin = cfg.hysteresis_margin;
422
423 // For upward transitions (higher load), require crossing threshold + margin
424 // For downward transitions (lower load), require crossing threshold - margin
425 if (raw_level > current_level) {
426 // Moving to higher load level - need to exceed threshold by margin
427 double next_threshold = get_threshold_for_level(
428 static_cast<load_level>(static_cast<int>(current_level) + 1), cfg);
429 if (effective_load < next_threshold + margin) {
430 return current_level; // Stay at current level (hysteresis)
431 }
432 } else {
433 // Moving to lower load level - need to drop below threshold by margin
434 if (effective_load > current_threshold - margin) {
435 return current_level; // Stay at current level (hysteresis)
436 }
437 }
438
439 return raw_level;
440 }
441
445 static double get_threshold_for_level(load_level level, const adaptive_config& cfg) {
446 switch (level) {
447 case load_level::idle: return 0.0;
448 case load_level::low: return cfg.idle_threshold;
449 case load_level::moderate: return cfg.low_threshold;
450 case load_level::high: return cfg.moderate_threshold;
451 case load_level::critical: return cfg.high_threshold;
452 }
453 return cfg.moderate_threshold;
454 }
455};
456
461private:
462 struct monitor_impl;
463 std::unique_ptr<monitor_impl> impl_;
464
465public:
468
469 // Disable copy
472
473 // Enable move
475 adaptive_monitor& operator=(adaptive_monitor&&) noexcept;
476
480 common::Result<bool> register_collector(
481 const std::string& name,
482 std::shared_ptr<kcenon::monitoring::metrics_collector> collector,
483 const adaptive_config& config = {}
484 );
485
489 common::Result<bool> unregister_collector(const std::string& name);
490
494 common::Result<bool> start();
495
499 common::Result<bool> stop();
500
504 bool is_running() const;
505
509 common::Result<adaptation_stats> get_collector_stats(
510 const std::string& name
511 ) const;
512
516 std::unordered_map<std::string, adaptation_stats> get_all_stats() const;
517
522
526 common::Result<bool> force_adaptation();
527
531 std::vector<std::string> get_active_collectors() const;
532
536 common::Result<bool> set_collector_priority(
537 const std::string& name,
538 int priority
539 );
540};
541
546
551private:
553 std::string collector_name_;
554 bool registered_{false};
555
556public:
558 const std::string& name,
559 std::shared_ptr<kcenon::monitoring::metrics_collector> collector,
560 const adaptive_config& config = {}
562 auto result = monitor_->register_collector(name, collector, config);
563 registered_ = result.is_ok() && result.value();
564 }
565
571
572 // Disable copy
575
576 // Enable move
578 : monitor_(other.monitor_)
579 , collector_name_(std::move(other.collector_name_))
580 , registered_(other.registered_) {
581 other.monitor_ = nullptr;
582 other.registered_ = false;
583 }
584
586 if (this != &other) {
587 if (registered_ && monitor_) {
589 }
590 monitor_ = other.monitor_;
591 collector_name_ = std::move(other.collector_name_);
592 registered_ = other.registered_;
593 other.monitor_ = nullptr;
594 other.registered_ = false;
595 }
596 return *this;
597 }
598
599 bool is_registered() const { return registered_; }
600};
601
602} } // namespace kcenon::monitoring
adaptation_stats get_stats() const
Get current adaptation statistics.
common::Result< kcenon::monitoring::metrics_snapshot > collect()
Collect metrics with adaptive sampling.
void adapt(const kcenon::monitoring::system_metrics &sys_metrics)
Adapt collection behavior based on load.
static load_level calculate_load_level_with_config(double cpu_usage, double memory_usage, const adaptive_config &cfg)
Calculate load level from metrics with provided config.
std::shared_ptr< kcenon::monitoring::metrics_collector > collector_
std::chrono::milliseconds get_current_interval() const
Get current collection interval.
adaptive_config get_config() const
Get adaptive configuration.
adaptive_collector(std::shared_ptr< kcenon::monitoring::metrics_collector > collector, const adaptive_config &config={})
static double get_threshold_for_level(load_level level, const adaptive_config &cfg)
Get the threshold value for a given load level.
void set_enabled(bool enabled)
Enable or disable adaptive behavior.
void set_config(const adaptive_config &config)
Set adaptive configuration.
bool is_enabled() const
Check if adaptive behavior is enabled.
static load_level calculate_load_level_with_hysteresis(double cpu_usage, double memory_usage, load_level current_level, const adaptive_config &cfg)
Calculate load level with hysteresis support (ARC-005).
bool should_sample() const
Determine if current sample should be collected.
Adaptive monitoring controller.
adaptive_monitor(const adaptive_monitor &)=delete
void set_global_strategy(adaptation_strategy strategy)
Set global adaptation strategy.
common::Result< bool > unregister_collector(const std::string &name)
Unregister a collector.
common::Result< bool > stop()
Stop adaptive monitoring.
adaptive_monitor(adaptive_monitor &&) noexcept
common::Result< bool > start()
Start adaptive monitoring.
std::unique_ptr< monitor_impl > impl_
adaptive_monitor & operator=(const adaptive_monitor &)=delete
std::vector< std::string > get_active_collectors() const
Get recommended collectors based on load.
std::unordered_map< std::string, adaptation_stats > get_all_stats() const
Get all collector statistics.
common::Result< adaptation_stats > get_collector_stats(const std::string &name) const
Get adaptation statistics for a collector.
bool is_running() const
Check if monitoring is active.
common::Result< bool > force_adaptation()
Force adaptation cycle.
common::Result< bool > register_collector(const std::string &name, std::shared_ptr< kcenon::monitoring::metrics_collector > collector, const adaptive_config &config={})
Register a collector for adaptive monitoring.
common::Result< bool > set_collector_priority(const std::string &name, int priority)
Set priority for a collector (higher priority = keep active longer).
adaptive_scope(const std::string &name, std::shared_ptr< kcenon::monitoring::metrics_collector > collector, const adaptive_config &config={})
adaptive_scope(adaptive_scope &&other) noexcept
adaptive_scope(const adaptive_scope &)=delete
adaptive_scope & operator=(adaptive_scope &&other) noexcept
adaptive_scope & operator=(const adaptive_scope &)=delete
Abstract base class for metric collectors.
virtual common::Result< metrics_snapshot > collect()=0
Collect metrics.
Monitoring system specific error codes.
Core monitoring system interface definitions.
load_level
System load levels.
adaptation_strategy
Adaptation strategy for monitoring behavior.
adaptive_monitor & global_adaptive_monitor()
Global adaptive monitor instance.
Performance monitoring and profiling implementation.
Result pattern type definitions for monitoring system.
std::chrono::system_clock::time_point last_level_change
std::chrono::milliseconds current_interval
std::chrono::system_clock::time_point last_adaptation
Adaptive configuration parameters.
std::chrono::milliseconds cooldown_period
std::chrono::milliseconds low_interval
std::chrono::milliseconds critical_interval
std::chrono::milliseconds moderate_interval
double get_sampling_rate_for_load(load_level level) const
Get sampling rate for load level.
std::chrono::milliseconds idle_interval
std::chrono::milliseconds get_interval_for_load(load_level level) const
Get collection interval for load level.
std::chrono::milliseconds high_interval
Extended error information with context.