Common System 0.2.0
Common interfaces and patterns for system integration
Loading...
Searching...
No Matches
health_monitor.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
14#pragma once
15
16#include <atomic>
17#include <chrono>
18#include <functional>
19#include <memory>
20#include <mutex>
21#include <sstream>
22#include <string>
23#include <unordered_map>
24#include <vector>
25
26#include "health_check.h"
28
30
36 std::size_t total_checks{0};
37 std::size_t healthy_count{0};
38 std::size_t degraded_count{0};
39 std::size_t unhealthy_count{0};
40 std::size_t unknown_count{0};
41 std::size_t check_executions{0};
42 std::size_t recovery_attempts{0};
43 std::size_t successful_recoveries{0};
44 std::chrono::system_clock::time_point last_check_time;
45 std::chrono::milliseconds last_check_duration{0};
46};
47
56using recovery_handler = std::function<bool()>;
57
96public:
97 health_monitor() = default;
98
// Destructor: calls stop() and discards the returned result through
// value_or(std::monostate{}), so a failure reported by stop() is ignored here.
// NOTE(review): the "Health monitor is not running" error shown further down
// the listing appears to be what stop() returns on a never-started monitor —
// confirm against the real header, whose stop() signature line is not visible.
99 ~health_monitor() { stop().value_or(std::monostate{}); }
100
105
// Register a health check.
//
// Holds mutex_ for the whole call and forwards to graph_.add_node(), moving the
// shared_ptr `check` into that call.
//
// name:  key under which the check is added to the dependency graph.
// check: the health check to add.
// Returns the Result<bool> produced by graph_.add_node(), unchanged.
112 Result<bool> register_check(const std::string& name, std::shared_ptr<health_check> check) {
113 std::lock_guard<std::mutex> lock(mutex_);
114
115 auto result = graph_.add_node(name, std::move(check));
116 if (result.is_ok()) {
// NOTE(review): listing line 117 is absent from this rendering, so the
// success-path action is not visible here — confirm against the real header.
118 }
119 return result;
120 }
121
// Unregister a health check.
//
// Under mutex_, removes the node `name` from graph_. When the removal succeeds,
// the recovery handler and the cached last result stored under the same name
// are erased as well; when it fails, both maps are left untouched.
//
// Returns the Result<bool> produced by graph_.remove_node(), unchanged.
127 Result<bool> unregister_check(const std::string& name) {
128 std::lock_guard<std::mutex> lock(mutex_);
129
130 auto result = graph_.remove_node(name);
131 if (result.is_ok()) {
// NOTE(review): listing line 132 is absent from this rendering; there is one
// more success-path statement that cannot be seen here.
133 recovery_handlers_.erase(name);
134 last_results_.erase(name);
135 }
136 return result;
137 }
138
// Execute a specific health check.
//
// mutex_ is held for the entire call, which includes running the check itself
// and any recovery handler that attempt_recovery() invokes.
// NOTE(review): every public method visible in this listing locks the same
// std::mutex, so a check or handler that calls back into this monitor would
// try to re-lock it on the same thread — confirm callers never do that.
//
// On success the result is cached in last_results_, the counters are updated
// through update_stats_after_check(), and last_check_time / last_check_duration
// are refreshed. On failure nothing is cached or updated.
//
// Returns the Result from graph_.check_with_dependencies(name), unchanged.
144 Result<health_check_result> check(const std::string& name) {
145 std::lock_guard<std::mutex> lock(mutex_);
146
// steady_clock measures the elapsed time of the check; system_clock (below)
// supplies the wall-clock timestamp stored in the stats.
147 auto start_time = std::chrono::steady_clock::now();
148 auto result = graph_.check_with_dependencies(name);
149 auto end_time = std::chrono::steady_clock::now();
150
151 if (result.is_ok()) {
152 last_results_[name] = result.value();
153 update_stats_after_check(result.value());
154
155 stats_.last_check_time = std::chrono::system_clock::now();
// Duration is truncated to whole milliseconds by duration_cast.
156 stats_.last_check_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
157 end_time - start_time);
// NOTE(review): listing line 158 is absent from this rendering.
159
160 // Trigger recovery if unhealthy
161 if (result.value().status == health_status::unhealthy) {
162 attempt_recovery(name);
163 }
164 }
165
166 return result;
167 }
168
// Add a dependency between health checks.
//
// Locks mutex_ and forwards both names to graph_.add_dependency(); any
// validation is done by the graph, not here.
//
// dependent:  name passed as the first argument to graph_.add_dependency().
// dependency: name passed as the second argument to graph_.add_dependency().
// Returns the Result<bool> produced by the graph, unchanged.
175 Result<bool> add_dependency(const std::string& dependent, const std::string& dependency) {
176 std::lock_guard<std::mutex> lock(mutex_);
177 return graph_.add_dependency(dependent, dependency);
178 }
179
185 if (running_.exchange(true)) {
186 return {error_info{1, "Health monitor is already running", "health_monitor"}};
187 }
188 return ok(std::monostate{});
189 }
190
196 if (!running_.exchange(false)) {
197 return {error_info{1, "Health monitor is not running", "health_monitor"}};
198 }
199 return ok(std::monostate{});
200 }
201
// Check if health monitoring is running.
// Reads the atomic running_ flag directly; mutex_ is not taken.
206 [[nodiscard]] bool is_running() const { return running_.load(); }
207
// Refresh all health checks.
//
// Under mutex_, runs graph_.check_with_dependencies() once for every name
// returned by graph_.get_all_nodes(). Each successful result is cached in
// last_results_ and fed to update_stats_after_check(); an unhealthy status
// additionally triggers attempt_recovery(). Checks whose Result is not ok are
// skipped without recording anything.
//
// After the loop, last_check_time is set to the current wall-clock time and
// last_check_duration to the elapsed time of the whole pass, in milliseconds.
// mutex_ stays locked throughout, including while recovery handlers run.
213 void refresh() {
214 std::lock_guard<std::mutex> lock(mutex_);
215
216 auto start_time = std::chrono::steady_clock::now();
217
218 // Reset counts
// NOTE(review): listing lines 219-222 (the statements this comment refers to)
// are absent from this rendering — confirm which counters are reset.
223
224 auto nodes = graph_.get_all_nodes();
225 for (const auto& name : nodes) {
226 auto result = graph_.check_with_dependencies(name);
227 if (result.is_ok()) {
228 last_results_[name] = result.value();
229 update_stats_after_check(result.value());
230
231 if (result.value().status == health_status::unhealthy) {
232 attempt_recovery(name);
233 }
234 }
235 }
236
237 auto end_time = std::chrono::steady_clock::now();
238 stats_.last_check_time = std::chrono::system_clock::now();
239 stats_.last_check_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
240 end_time - start_time);
// NOTE(review): listing line 241 is absent from this rendering.
242 }
243
// Register a recovery handler for a health check.
//
// Stores `handler` in recovery_handlers_ under `name` while holding mutex_.
// The assignment goes through operator[], so a handler already stored for the
// same name is replaced. This function does not check that `name` refers to a
// registered health check.
//
// name:    key the handler is stored under; attempt_recovery() looks it up by
//          the same key.
// handler: callable returning bool; moved into the map.
249 void register_recovery_handler(const std::string& name, recovery_handler handler) {
250 std::lock_guard<std::mutex> lock(mutex_);
251 recovery_handlers_[name] = std::move(handler);
252 }
253
// Get monitoring statistics.
// Returns a copy of stats_ taken while mutex_ is held, so the caller receives a
// snapshot rather than a reference to the live counters.
258 [[nodiscard]] health_monitor_stats get_stats() const {
259 std::lock_guard<std::mutex> lock(mutex_);
260 return stats_;
261 }
262
// Get a formatted health report.
//
// Builds a multi-line, human-readable string under mutex_: a header, the
// overall status, the per-status counters from stats_, and then one line per
// entry of last_results_ in the form "<name>: <status>[ - <message>]".
//
// Returns the assembled report text.
267 [[nodiscard]] std::string get_health_report() const {
268 std::lock_guard<std::mutex> lock(mutex_);
269
270 std::ostringstream report;
271 report << "=== Health Report ===\n";
// NOTE(review): called with mutex_ already held; the helper's body is not
// visible in this rendering — presumably it does not lock again, confirm.
272 report << "Status: " << get_overall_status_string() << "\n";
273 report << "Total Checks: " << stats_.total_checks << "\n";
274 report << "Healthy: " << stats_.healthy_count << "\n";
275 report << "Degraded: " << stats_.degraded_count << "\n";
276 report << "Unhealthy: " << stats_.unhealthy_count << "\n";
277 report << "Unknown: " << stats_.unknown_count << "\n";
278 report << "\n--- Individual Checks ---\n";
279
// last_results_ is a std::unordered_map, so the order of the per-check lines is
// unspecified and may differ between runs.
280 for (const auto& [name, result] : last_results_) {
281 report << name << ": " << to_string(result.status);
// The " - <message>" suffix is emitted only when the check supplied a message.
282 if (!result.message.empty()) {
283 report << " - " << result.message;
284 }
285 report << "\n";
286 }
287
288 return report.str();
289 }
290
// Get the overall health status.
//
// Under mutex_, inspects the stats_ counters in a fixed order: unhealthy,
// then degraded, then unknown, then healthy. The first non-zero counter
// selects the branch that is taken.
// NOTE(review): the statement inside each branch and the final fall-through
// statement (listing lines 299, 302, 305, 308, 310) are absent from this
// rendering, so the value returned for each case cannot be confirmed here.
295 [[nodiscard]] health_status get_overall_status() const {
296 std::lock_guard<std::mutex> lock(mutex_);
297
298 if (stats_.unhealthy_count > 0) {
300 }
301 if (stats_.degraded_count > 0) {
303 }
304 if (stats_.unknown_count > 0) {
306 }
307 if (stats_.healthy_count > 0) {
309 }
311 }
312
// Check if a health check is registered.
// Locks mutex_ and returns graph_.has_node(name).
318 [[nodiscard]] bool has_check(const std::string& name) const {
319 std::lock_guard<std::mutex> lock(mutex_);
320 return graph_.has_node(name);
321 }
322
// Get all registered check names.
// Locks mutex_ and returns the vector produced by graph_.get_all_nodes(); the
// ordering of the names is whatever the graph returns.
327 [[nodiscard]] std::vector<std::string> get_check_names() const {
328 std::lock_guard<std::mutex> lock(mutex_);
329 return graph_.get_all_nodes();
330 }
331
332private:
334 switch (result.status) {
337 break;
340 break;
343 break;
346 break;
347 }
348 }
349
// Invoke the recovery handler registered for `name`, if there is one.
//
// Private helper. It does not lock mutex_ itself; the callers visible in this
// listing (check() and refresh()) call it while already holding the lock.
// Returns silently when no handler is stored under `name`.
350 void attempt_recovery(const std::string& name) {
351 auto it = recovery_handlers_.find(name);
352 if (it == recovery_handlers_.end()) {
353 return;
354 }
355
// NOTE(review): listing lines 356 and 358 are absent from this rendering. The
// handler's bool result guards line 358, so that is where a successful
// recovery is acted on — confirm against the real header.
357 if (it->second()) {
359 }
360 }
361
362 [[nodiscard]] std::string get_overall_status_string() const {
364 }
365
367 if (stats_.unhealthy_count > 0) {
369 }
370 if (stats_.degraded_count > 0) {
372 }
373 if (stats_.unknown_count > 0) {
375 }
376 if (stats_.healthy_count > 0) {
378 }
380 }
381
383 std::unordered_map<std::string, recovery_handler> recovery_handlers_;
384 std::unordered_map<std::string, health_check_result> last_results_;
386 std::atomic<bool> running_{false};
387 mutable std::mutex mutex_;
388};
389
404 static health_monitor instance;
405 return instance;
406}
407
408} // namespace kcenon::common::interfaces
Result type for error handling with member function support.
Definition core.cppm:165
T value_or(T default_value) const
Get value or return default (C++23 std::expected compatible)
Definition core.h:384
Manages dependencies between health checks as a DAG.
Result< bool > add_dependency(const std::string &dependent, const std::string &dependency)
Add a dependency between two nodes.
Result< bool > add_node(const std::string &name, std::shared_ptr< health_check > check)
Add a health check node to the graph.
Result< health_check_result > check_with_dependencies(const std::string &name)
Execute health check with its dependencies.
bool has_node(const std::string &name) const
Check if a node exists.
Result< bool > remove_node(const std::string &name)
Remove a health check node from the graph.
std::vector< std::string > get_all_nodes() const
Get all node names.
Central health monitoring system.
void attempt_recovery(const std::string &name)
std::vector< std::string > get_check_names() const
Get all registered check names.
health_monitor(const health_monitor &)=delete
health_monitor(health_monitor &&)=delete
health_monitor & operator=(health_monitor &&)=delete
health_monitor_stats get_stats() const
Get monitoring statistics.
bool has_check(const std::string &name) const
Check if a health check is registered.
health_monitor & operator=(const health_monitor &)=delete
bool is_running() const
Check if health monitoring is running.
Result< health_check_result > check(const std::string &name)
Execute a specific health check.
Result< bool > add_dependency(const std::string &dependent, const std::string &dependency)
Add a dependency between health checks.
VoidResult start()
Start the health monitoring.
Result< bool > unregister_check(const std::string &name)
Unregister a health check.
std::string get_health_report() const
Get a formatted health report.
void update_stats_after_check(const health_check_result &result)
void register_recovery_handler(const std::string &name, recovery_handler handler)
Register a recovery handler for a health check.
health_status get_overall_status() const
Get the overall health status.
Result< bool > register_check(const std::string &name, std::shared_ptr< health_check > check)
Register a health check.
std::unordered_map< std::string, health_check_result > last_results_
std::unordered_map< std::string, recovery_handler > recovery_handlers_
VoidResult stop()
Stop the health monitoring.
void refresh()
Refresh all health checks.
Base classes and types for health checking functionality.
DAG-based health check dependency management.
health_status
Standard health status levels.
std::function< bool()> recovery_handler
Recovery handler function type.
health_monitor & global_health_monitor()
Get the global health monitor instance.
std::string to_string(log_level level)
Convert log level to string.
VoidResult ok()
Create a successful void result.
Definition utilities.h:71
Standard error information used by Result<T>.
Definition core.cppm:106
std::chrono::system_clock::time_point last_check_time