Monitoring System 0.1.0
System resource monitoring with pluggable collectors and alerting
Loading...
Searching...
No Matches
performance_monitor.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
14#pragma once
15
16#include <string>
17#include <memory>
18#include <chrono>
19#include <vector>
20#include <deque>
21#include <unordered_map>
22#include <mutex>
23#include <atomic>
24#include <functional>
25#include <thread>
26#include <algorithm>
27#include <numeric>
28#include <cmath>
29#include <shared_mutex>
30
32#include "../core/error_codes.h"
34
35// Use common_system interfaces (Phase 2.3.4)
36#include <kcenon/common/interfaces/monitoring_interface.h>
37
38namespace kcenon { namespace monitoring {
39
46using tag_map = std::unordered_map<std::string, std::string>;
47
53 counter,
54 gauge,
56};
57
63 std::string name;
64 double value;
67 std::chrono::system_clock::time_point timestamp;
68
// Build a tagged metric from its name, value, type and tags.
//   n   - metric name (copied into `name`)
//   v   - metric value
//   t   - metric type (stored in `type`)
//   tgs - tags/labels (copied into `tags`); defaults to an empty map
// `timestamp` is set to std::chrono::system_clock::now() at construction.
69 tagged_metric(const std::string& n, double v, recorded_metric_type t,
70 const tag_map& tgs = {})
71 : name(n)
72 , value(v)
73 , type(t)
74 , tags(tgs)
75 , timestamp(std::chrono::system_clock::now()) {}
76
// Generate a unique key for aggregation based on the name and sorted tags.
// Returns the metric name followed by one ";key=value" segment per tag,
// with the tags in ascending (key, value) order.
// NOTE(review): ';' and '=' inside tag keys or values are not escaped, so
// two different tag sets can yield the same key string.
80 std::string key() const {
81 std::string k = name;
82 // Sort tags for consistent key generation
// (tag_map is a std::unordered_map, so its iteration order is not fixed;
// the tags are copied into a vector of pairs and sorted there)
83 std::vector<std::pair<std::string, std::string>> sorted_tags(
84 tags.begin(), tags.end());
85 std::sort(sorted_tags.begin(), sorted_tags.end());
86 for (const auto& [tag_key, tag_value] : sorted_tags) {
87 k += ";" + tag_key + "=" + tag_value;
88 }
89 return k;
90 }
91};
92
97 std::string operation_name;
98 std::chrono::nanoseconds min_duration{std::chrono::nanoseconds::max()};
99 std::chrono::nanoseconds max_duration{std::chrono::nanoseconds::zero()};
100 std::chrono::nanoseconds total_duration{std::chrono::nanoseconds::zero()};
101 std::chrono::nanoseconds mean_duration{std::chrono::nanoseconds::zero()};
102 std::chrono::nanoseconds median_duration{std::chrono::nanoseconds::zero()};
103 std::chrono::nanoseconds p95_duration{std::chrono::nanoseconds::zero()};
104 std::chrono::nanoseconds p99_duration{std::chrono::nanoseconds::zero()};
105 std::uint64_t call_count{0};
106 std::uint64_t error_count{0};
107 double throughput{0.0}; // Operations per second
108
109};
110
115 double cpu_usage_percent{0.0};
117 std::size_t memory_usage_bytes{0};
119 std::uint32_t thread_count{0};
120 std::uint32_t handle_count{0};
121 double disk_io_read_rate{0.0}; // MB/s
122 double disk_io_write_rate{0.0}; // MB/s
123 double network_io_recv_rate{0.0}; // MB/s
124 double network_io_send_rate{0.0}; // MB/s
125
126 std::chrono::system_clock::time_point timestamp;
127};
128
138private:
140 // Using deque instead of vector for O(1) pop_front performance
141 // when removing oldest samples in ring buffer behavior
142 std::deque<std::chrono::nanoseconds> samples;
143 std::atomic<std::uint64_t> call_count{0};
144 std::atomic<std::uint64_t> error_count{0};
145 // Store time as atomic integer for thread-safe access without locks
146 std::atomic<std::chrono::steady_clock::rep> last_access_time{std::chrono::steady_clock::now().time_since_epoch().count()};
147 mutable std::mutex mutex;
148 };
149
150 std::unordered_map<std::string, std::unique_ptr<profile_data>> profiles_;
151 mutable std::shared_mutex profiles_mutex_;
152 std::atomic<bool> enabled_{true};
153 std::size_t max_samples_per_operation_{10000};
154 std::size_t max_profiles_{10000}; // LRU eviction threshold
155
156 // Lock-free collection path (Sprint 3-4)
157 std::atomic<bool> use_lock_free_path_{false};
158
159public:
163 common::Result<bool> record_sample(
164 const std::string& operation_name,
165 std::chrono::nanoseconds duration,
166 bool success = true
167 );
168
172 common::Result<performance_metrics> get_metrics(
173 const std::string& operation_name
174 ) const;
175
179 std::vector<performance_metrics> get_all_metrics() const;
180
184 common::Result<bool> clear_samples(const std::string& operation_name);
185
190
// Enable or disable profiling by storing the flag in the atomic `enabled_`.
195 void set_enabled(bool enabled) { enabled_ = enabled; }
196
// Check if profiling is enabled (reads the atomic `enabled_` flag).
201 bool is_enabled() const { return enabled_; }
202
// Set maximum samples per operation.
// NOTE(review): max_samples_per_operation_ is a plain std::size_t and this
// write takes no lock - confirm it is never called while samples are being
// recorded on another thread.
207 void set_max_samples(std::size_t max_samples) {
208 max_samples_per_operation_ = max_samples;
209 }
210
// Enable or disable the lock-free collection path (Sprint 3-4).
// Only stores the flag in the atomic `use_lock_free_path_`; how the flag is
// consumed is not visible in this header.
220 void set_lock_free_mode(bool enable) {
221 use_lock_free_path_ = enable;
222 }
223
// Check if lock-free mode is enabled (reads `use_lock_free_path_`).
228 bool is_lock_free_mode() const {
229 return use_lock_free_path_;
230 }
231};
232
237private:
239 std::string operation_name_;
240 std::chrono::high_resolution_clock::time_point start_time_;
241 bool success_{true};
242 bool completed_{false};
243
244public:
// Start timing an operation.
//   profiler       - profiler the timer is bound to; may be null (complete()
//                    checks for null before using it). Stored as given,
//                    presumably must outlive the timer - TODO confirm.
//   operation_name - name copied into `operation_name_`
// The start time is taken from std::chrono::high_resolution_clock::now().
245 scoped_timer(performance_profiler* profiler, const std::string& operation_name)
246 : profiler_(profiler)
247 , operation_name_(operation_name)
248 , start_time_(std::chrono::high_resolution_clock::now()) {}
249
251 if (!completed_ && profiler_) {
252 complete();
253 }
254 }
255
// Mark the operation as failed by clearing the `success_` flag.
259 void mark_failed() { success_ = false; }
260
// Manually complete the timing.
// Does nothing if the timer was already completed; otherwise measures the
// time since `start_time_` in nanoseconds and sets `completed_`.
// NOTE(review): listing line 273 (the body of the `if (profiler_)` branch)
// is missing from this rendering; presumably it records the sample on the
// profiler - confirm against the real header.
264 void complete() {
265 if (completed_) return;
266
267 auto end_time = std::chrono::high_resolution_clock::now();
268 auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
269 end_time - start_time_
270 );
271
// A null profiler is allowed; the branch is skipped in that case.
272 if (profiler_) {
274 }
275
276 completed_ = true;
277 }
278
// Get elapsed time without completing.
// Returns the time between `start_time_` and now, in nanoseconds; the
// timer's state is not modified.
282 std::chrono::nanoseconds elapsed() const {
283 auto now = std::chrono::high_resolution_clock::now();
284 return std::chrono::duration_cast<std::chrono::nanoseconds>(
285 now - start_time_
286 );
287 }
288};
289
294private:
295 struct monitor_impl;
296 std::unique_ptr<monitor_impl> impl_;
297
298public:
301
302 // Disable copy
305
306 // Enable move
308 system_monitor& operator=(system_monitor&&) noexcept;
309
313 common::Result<system_metrics> get_current_metrics() const;
314
318 common::Result<bool> start_monitoring(
319 std::chrono::milliseconds interval = std::chrono::milliseconds(1000)
320 );
321
325 common::Result<bool> stop_monitoring();
326
331 bool is_monitoring() const;
332
337 std::chrono::seconds duration = std::chrono::seconds(60)
338 ) const;
339};
340
355private:
358 std::string name_;
359 bool enabled_{true};
360
361 // Performance thresholds for alerting
// Threshold values and their defaults; the instance `thresholds_` is
// guarded by thresholds_mutex_ in the setters.
362 struct thresholds {
// Default 80.0 - presumably a percentage; TODO confirm units.
363 double cpu_threshold{80.0};
// Default 90.0 - presumably a percentage; TODO confirm units.
364 double memory_threshold{90.0};
// Default 1000 ms.
365 std::chrono::milliseconds latency_threshold{1000};
366 } thresholds_;
367 mutable std::mutex thresholds_mutex_; // Protects thresholds_
368
369 // Tagged metrics storage for counters, gauges, and histograms
// Stored state for one entry of tagged_metrics_.
// NOTE(review): listing lines 372-373 are missing from this rendering, so
// the struct may have further members not shown here.
370 struct metric_data {
// Current metric value; starts at 0.0.
371 double value{0.0};
374 std::chrono::system_clock::time_point last_update;
375 std::vector<double> histogram_values; // For histogram type only
// Default 1000 - presumably the cap on histogram_values; TODO confirm.
376 std::size_t max_histogram_samples{1000};
377 };
378 std::unordered_map<std::string, std::unique_ptr<metric_data>> tagged_metrics_;
379 mutable std::shared_mutex metrics_mutex_; // Protects tagged_metrics_
380
381public:
// Construct a performance monitor.
//   name - name stored in `name_`; defaults to "performance_monitor"
382 explicit performance_monitor(const std::string& name = "performance_monitor")
383 : name_(name) {}
384
385 // Implement metrics_collector interface
386
// Get the name of this performance monitor (returned by value).
391 std::string get_name() const override { return name_; }
392
// Check if this performance monitor is enabled.
397 bool is_enabled() const override { return enabled_; }
398
// Enable or disable the collector.
// Stores the flag locally and forwards it to the profiler; always returns
// common::ok().
// NOTE(review): `enabled_` here is a plain bool (the profiler's flag is
// atomic) - confirm this is not toggled concurrently with is_enabled().
399 common::VoidResult set_enabled(bool enable) override {
400 enabled_ = enable;
401 profiler_.set_enabled(enable);
402 return common::ok();
403 }
404
// Initialize the collector.
// Starts system monitoring with start_monitoring()'s default interval
// (1000 ms). Returns that call's error on failure, common::ok() otherwise.
405 common::VoidResult initialize() override {
406 auto result = system_monitor_.start_monitoring();
407 if (result.is_err()) {
408 return common::VoidResult::err(result.error());
409 }
410 return common::ok();
411 }
412
// Cleanup collector resources.
// Stops system monitoring. Returns the error from stop_monitoring() on
// failure, common::ok() otherwise.
413 common::VoidResult cleanup() override {
414 auto result = system_monitor_.stop_monitoring();
415 if (result.is_err()) {
416 return common::VoidResult::err(result.error());
417 }
418 return common::ok();
419 }
420
421 common::Result<metrics_snapshot> collect() override;
422
// Create a scoped timer for an operation.
// The returned timer is bound to this monitor's profiler and starts timing
// immediately.
426 scoped_timer time_operation(const std::string& operation_name) {
427 return scoped_timer(&profiler_, operation_name);
428 }
429
// Get performance profiler (mutable reference to the owned member).
434 performance_profiler& get_profiler() { return profiler_; }
435
// Get performance profiler (const reference to the owned member).
440 const performance_profiler& get_profiler() const { return profiler_; }
441
// Get system monitor (mutable reference to the owned member).
446 system_monitor& get_system_monitor() { return system_monitor_; }
447
// Get system monitor (const reference to the owned member).
452 const system_monitor& get_system_monitor() const { return system_monitor_; }
453
// Set the CPU threshold; the write is done while holding thresholds_mutex_.
458 void set_cpu_threshold(double threshold) {
459 std::lock_guard<std::mutex> lock(thresholds_mutex_);
460 thresholds_.cpu_threshold = threshold;
461 }
462
// Set the memory threshold; the write is done while holding
// thresholds_mutex_.
463 void set_memory_threshold(double threshold) {
464 std::lock_guard<std::mutex> lock(thresholds_mutex_);
465 thresholds_.memory_threshold = threshold;
466 }
467
// Set the latency threshold (milliseconds); the write is done while holding
// thresholds_mutex_.
468 void set_latency_threshold(std::chrono::milliseconds threshold) {
469 std::lock_guard<std::mutex> lock(thresholds_mutex_);
470 thresholds_.latency_threshold = threshold;
471 }
472
478 std::lock_guard<std::mutex> lock(thresholds_mutex_);
479 return thresholds_;
480 }
481
485 common::Result<bool> check_thresholds() const;
486
487 // IMonitor interface implementation (Phase 2.3.4)
488
// Reset recorded data: clears all profiler samples, then all recorded
// tagged metrics. The system monitor is not touched here.
493 void reset() {
494 profiler_.clear_all_samples();
495 clear_all_metrics();
496 }
497
498 // =========================================================================
499 // Tagged Metric Recording Methods
500 // =========================================================================
501
525 common::VoidResult record_counter(const std::string& name, double value,
526 const tag_map& tags = {});
527
550 common::VoidResult record_gauge(const std::string& name, double value,
551 const tag_map& tags = {});
552
575 common::VoidResult record_histogram(const std::string& name, double value,
576 const tag_map& tags = {});
577
585 std::vector<tagged_metric> get_all_tagged_metrics() const;
586
593
594private:
598 static std::string make_metric_key(const std::string& name, const tag_map& tags);
599
603 common::VoidResult record_metric_internal(const std::string& name, double value,
604 recorded_metric_type type, const tag_map& tags);
605};
606
611
// Declare a scoped_timer named `_perf_timer` in the current scope, bound to
// the global performance monitor's profiler and timing `operation_name`.
// NOTE(review): the variable name is fixed, so using this macro (or
// PERF_TIMER_CUSTOM) twice in the same scope is a redeclaration error.
615#define PERF_TIMER(operation_name) \
616 kcenon::monitoring::scoped_timer _perf_timer( \
617 &kcenon::monitoring::global_performance_monitor().get_profiler(), \
618 operation_name \
619 )
620
// Declare a scoped_timer named `_perf_timer` in the current scope, bound to
// a caller-supplied profiler. `profiler` is passed straight to the
// scoped_timer constructor, which takes a performance_profiler*.
// NOTE(review): the variable name is fixed, so this cannot be combined with
// another PERF_TIMER / PERF_TIMER_CUSTOM in the same scope.
621#define PERF_TIMER_CUSTOM(profiler, operation_name) \
622 kcenon::monitoring::scoped_timer _perf_timer(profiler, operation_name)
623
628private:
630 std::string name_;
631 std::uint32_t iterations_{1000};
632 std::uint32_t warmup_iterations_{100};
633
634public:
// Construct a benchmark.
//   name - name stored in `name_`
635 explicit performance_benchmark(const std::string& name)
636 : name_(name) {}
637
// Set number of benchmark iterations (the member defaults to 1000).
642 void set_iterations(std::uint32_t iterations) {
643 iterations_ = iterations;
644 }
645
// Set number of warmup iterations (the member defaults to 100).
650 void set_warmup_iterations(std::uint32_t warmup) {
651 warmup_iterations_ = warmup;
652 }
653
// Run a benchmark.
//   operation_name - name under which samples are recorded in profiler_
//   func           - callable invoked with no arguments
// Calls func warmup_iterations_ times untimed, then iterations_ times with
// each call timed and recorded as one sample. An exception thrown by a timed
// call is caught and recorded as a failed sample; the loop then continues.
// Returns profiler_.get_metrics(operation_name).
// NOTE(review): exceptions thrown during warmup are not caught and
// propagate to the caller; the results of record_sample() are ignored.
657 template<typename Func>
658 common::Result<performance_metrics> run(
659 const std::string& operation_name,
660 Func&& func
661 ) {
662 // Warmup
663 for (std::uint32_t i = 0; i < warmup_iterations_; ++i) {
664 func();
665 }
666
667 // Actual benchmark
668 for (std::uint32_t i = 0; i < iterations_; ++i) {
669 auto start = std::chrono::high_resolution_clock::now();
670
671 try {
672 func();
673 } catch (...) {
674 // Record error
// The duration up to the throw is still recorded, with success = false.
675 auto end = std::chrono::high_resolution_clock::now();
676 auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
677 end - start
678 );
679 profiler_.record_sample(operation_name, duration, false);
680 continue;
681 }
682
683 auto end = std::chrono::high_resolution_clock::now();
684 auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
685 end - start
686 );
687 profiler_.record_sample(operation_name, duration, true);
688 }
689
690 return profiler_.get_metrics(operation_name);
691 }
692
// Compare two operations.
// Runs the first benchmark to completion, then the second, each via run().
// Returns the first error encountered (the second benchmark is not run if
// the first one fails); on success returns the two metrics as a pair in
// (operation1, operation2) order.
696 template<typename Func1, typename Func2>
697 common::Result<std::pair<performance_metrics, performance_metrics>> compare(
698 const std::string& operation1_name,
699 Func1&& func1,
700 const std::string& operation2_name,
701 Func2&& func2
702 ) {
703 auto result1 = run(operation1_name, std::forward<Func1>(func1));
704 if (result1.is_err()) {
705 return common::Result<std::pair<performance_metrics, performance_metrics>>::err(result1.error());
706 }
707
708 auto result2 = run(operation2_name, std::forward<Func2>(func2));
709 if (result2.is_err()) {
710 return common::Result<std::pair<performance_metrics, performance_metrics>>::err(result2.error());
711 }
712
713 return common::ok(std::make_pair(result1.value(), result2.value()));
714 }
715};
716
717// Platform-specific system metrics collection functions
718#if defined(__linux__)
723common::Result<system_metrics> get_linux_system_metrics();
724#endif
725
726#if defined(_WIN32)
731common::Result<system_metrics> get_windows_system_metrics();
732#endif
733
734} } // namespace kcenon::monitoring
Abstract base class for metric collectors.
void set_warmup_iterations(std::uint32_t warmup)
Set number of warmup iterations.
common::Result< performance_metrics > run(const std::string &operation_name, Func &&func)
Run a benchmark.
void set_iterations(std::uint32_t iterations)
Set number of benchmark iterations.
common::Result< std::pair< performance_metrics, performance_metrics > > compare(const std::string &operation1_name, Func1 &&func1, const std::string &operation2_name, Func2 &&func2)
Compare two operations.
Performance monitor combining profiling and system monitoring.
performance_monitor(const std::string &name="performance_monitor")
void reset()
Reset all performance profiler samples and recorded tagged metrics.
common::VoidResult initialize() override
Initialize the collector.
common::VoidResult record_histogram(const std::string &name, double value, const tag_map &tags={})
system_monitor & get_system_monitor()
Get system monitor.
std::string get_name() const override
Get the name of this performance monitor.
static std::string make_metric_key(const std::string &name, const tag_map &tags)
Generate a unique key from metric name and tags.
const performance_profiler & get_profiler() const
Get performance profiler (const)
thresholds get_thresholds() const
Get current threshold values.
common::Result< bool > check_thresholds() const
Check if any thresholds are exceeded.
void set_cpu_threshold(double threshold)
Set performance thresholds.
common::VoidResult cleanup() override
Cleanup collector resources.
bool is_enabled() const override
Check if this performance monitor is enabled.
void set_latency_threshold(std::chrono::milliseconds threshold)
common::Result< metrics_snapshot > collect() override
Collect metrics.
void clear_all_metrics()
Clear all recorded tagged metrics.
common::VoidResult set_enabled(bool enable) override
Enable or disable the collector.
scoped_timer time_operation(const std::string &operation_name)
Create a scoped timer for an operation.
common::VoidResult record_metric_internal(const std::string &name, double value, recorded_metric_type type, const tag_map &tags)
Internal method to record a metric with type and tags.
std::unordered_map< std::string, std::unique_ptr< metric_data > > tagged_metrics_
const system_monitor & get_system_monitor() const
Get system monitor (const)
std::vector< tagged_metric > get_all_tagged_metrics() const
Get all recorded tagged metrics.
common::VoidResult record_gauge(const std::string &name, double value, const tag_map &tags={})
common::VoidResult record_counter(const std::string &name, double value, const tag_map &tags={})
performance_profiler & get_profiler()
Get performance profiler.
Performance profiler for code sections.
bool is_lock_free_mode() const
Check if lock-free mode is enabled.
std::vector< performance_metrics > get_all_metrics() const
Get all performance metrics.
bool is_enabled() const
Check if profiling is enabled.
common::Result< bool > record_sample(const std::string &operation_name, std::chrono::nanoseconds duration, bool success=true)
Record a performance sample.
common::Result< bool > clear_samples(const std::string &operation_name)
Clear samples for an operation.
void set_max_samples(std::size_t max_samples)
Set maximum samples per operation.
std::unordered_map< std::string, std::unique_ptr< profile_data > > profiles_
void set_enabled(bool enabled)
Enable or disable profiling.
void clear_all_samples()
Clear all samples.
common::Result< performance_metrics > get_metrics(const std::string &operation_name) const
Get performance metrics for an operation.
void set_lock_free_mode(bool enable)
Enable lock-free collection path (Sprint 3-4)
std::chrono::high_resolution_clock::time_point start_time_
std::chrono::nanoseconds elapsed() const
Get elapsed time without completing.
void complete()
Manually complete the timing.
void mark_failed()
Mark the operation as failed.
scoped_timer(performance_profiler *profiler, const std::string &operation_name)
std::vector< system_metrics > get_history(std::chrono::seconds duration=std::chrono::seconds(60)) const
Get historical metrics.
common::Result< bool > start_monitoring(std::chrono::milliseconds interval=std::chrono::milliseconds(1000))
Start monitoring system resources.
common::Result< bool > stop_monitoring()
Stop monitoring.
system_monitor(const system_monitor &)=delete
system_monitor & operator=(const system_monitor &)=delete
system_monitor(system_monitor &&) noexcept
std::unique_ptr< monitor_impl > impl_
common::Result< system_metrics > get_current_metrics() const
Get current system metrics.
bool is_monitoring() const
Check if monitoring is active.
Monitoring system specific error codes.
Core monitoring system interface definitions.
performance_monitor & global_performance_monitor()
Global performance monitor instance.
std::unordered_map< std::string, std::string > tag_map
Type alias for metric tags/labels.
recorded_metric_type
Types of recorded metrics.
@ gauge
Instantaneous value that can go up and down.
@ counter
Monotonically increasing counter.
@ histogram
Distribution of values with buckets.
Result pattern type definitions for monitoring system.
Performance metrics for a specific operation.
std::chrono::system_clock::time_point last_update
std::atomic< std::chrono::steady_clock::rep > last_access_time
std::chrono::system_clock::time_point timestamp
Represents a metric value with associated tags.
std::string key() const
Generate unique key for aggregation based on name and sorted tags.
tagged_metric(const std::string &n, double v, recorded_metric_type t, const tag_map &tgs={})
std::chrono::system_clock::time_point timestamp
std::string name_