Monitoring System 0.1.0
System resource monitoring with pluggable collectors and alerting
Loading...
Searching...
No Matches
health_monitor.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
5#pragma once
6
46#include <atomic>
47#include <chrono>
48#include <condition_variable>
49#include <functional>
50#include <memory>
51#include <mutex>
52#include <queue>
53#include <shared_mutex>
54#include <string>
55#include <thread>
56#include <unordered_map>
57#include <unordered_set>
58#include <vector>
59
61
62namespace kcenon::monitoring {
63
70 liveness,
71 readiness,
72 startup
73};
74
79 std::chrono::milliseconds check_interval{std::chrono::milliseconds(5000)};
80 std::chrono::seconds cache_duration{std::chrono::seconds(1)};
83 std::chrono::seconds recovery_timeout{std::chrono::seconds(30)};
84};
85
90 size_t total_checks{0};
91 size_t healthy_checks{0};
92 size_t unhealthy_checks{0};
93 size_t degraded_checks{0};
96 std::chrono::system_clock::time_point last_check_time;
97};
98
111public:
112 virtual ~health_check() = default;
113
118 virtual std::string get_name() const = 0;
119
124 virtual health_check_type get_type() const = 0;
125
131
136 virtual std::chrono::milliseconds get_timeout() const {
137 return std::chrono::milliseconds(1000);
138 }
139
144 virtual bool is_critical() const {
145 return false;
146 }
147};
148
158public:
167 functional_health_check(const std::string& name,
169 std::function<health_check_result()> check_func,
170 std::chrono::milliseconds timeout = std::chrono::milliseconds(1000),
171 bool critical = false)
172 : name_(name)
173 , type_(type)
174 , check_func_(std::move(check_func))
175 , timeout_(timeout)
176 , critical_(critical) {}
177
179 std::string get_name() const override { return name_; }
181 health_check_type get_type() const override { return type_; }
183 std::chrono::milliseconds get_timeout() const override { return timeout_; }
185 bool is_critical() const override { return critical_; }
186
192 if (check_func_) {
193 return check_func_();
194 }
195 return health_check_result::healthy("No check function");
196 }
197
198private:
199 std::string name_;
202 std::chrono::milliseconds timeout_;
204};
205
220public:
228 composite_health_check(const std::string& name,
230 bool all_required = true)
231 : name_(name)
232 , type_(type)
233 , all_required_(all_required) {}
234
236 std::string get_name() const override { return name_; }
238 health_check_type get_type() const override { return type_; }
239
244 void add_check(std::shared_ptr<health_check> check) {
245 std::lock_guard<std::mutex> lock(mutex_);
246 checks_.push_back(std::move(check));
247 }
248
254 std::lock_guard<std::mutex> lock(mutex_);
255
256 if (checks_.empty()) {
257 return health_check_result::healthy("No checks configured");
258 }
259
260 std::vector<health_check_result> results;
261 results.reserve(checks_.size());
262
263 for (const auto& chk : checks_) {
264 results.push_back(chk->check());
265 }
266
267 if (all_required_) {
268 return check_all_required(results);
269 } else {
270 return check_any_required(results);
271 }
272 }
273
274private:
275 health_check_result check_all_required(const std::vector<health_check_result>& results) {
276 bool has_unhealthy = false;
277 bool has_degraded = false;
278 std::string message;
279
280 for (const auto& result : results) {
281 if (result.status == health_status::unhealthy) {
282 has_unhealthy = true;
283 message += result.message + "; ";
284 } else if (result.status == health_status::degraded) {
285 has_degraded = true;
286 message += result.message + "; ";
287 }
288 }
289
290 if (has_unhealthy) {
291 return health_check_result::unhealthy(message.empty() ? "One or more checks failed" : message);
292 }
293 if (has_degraded) {
294 return health_check_result::degraded(message.empty() ? "One or more checks degraded" : message);
295 }
296 return health_check_result::healthy("All checks passed");
297 }
298
299 health_check_result check_any_required(const std::vector<health_check_result>& results) {
300 bool any_healthy = false;
301 std::string message;
302
303 for (const auto& result : results) {
304 if (result.status == health_status::healthy) {
305 any_healthy = true;
306 break;
307 }
308 message += result.message + "; ";
309 }
310
311 if (any_healthy) {
312 return health_check_result::healthy("At least one check passed");
313 }
314 return health_check_result::unhealthy(message.empty() ? "All checks failed" : message);
315 }
316
317 std::string name_;
320 std::vector<std::shared_ptr<health_check>> checks_;
321 mutable std::mutex mutex_;
322};
323
338public:
345 common::Result<bool> add_node(const std::string& name, std::shared_ptr<health_check> check) {
346 std::lock_guard<std::shared_mutex> lock(mutex_);
347
348 if (nodes_.find(name) != nodes_.end()) {
349 return common::Result<bool>::err(error_info(monitoring_error_code::already_exists, "Node '" + name + "' already exists").to_common_error());
350 }
351
352 nodes_[name] = std::move(check);
353 dependencies_[name] = {};
354 dependents_[name] = {};
355 return common::ok(true);
356 }
357
364 common::Result<bool> add_dependency(const std::string& dependent, const std::string& dependency) {
365 std::lock_guard<std::shared_mutex> lock(mutex_);
366
367 if (nodes_.find(dependent) == nodes_.end()) {
368 return common::Result<bool>::err(error_info(monitoring_error_code::not_found, "Dependent '" + dependent + "' not found").to_common_error());
369 }
370 if (nodes_.find(dependency) == nodes_.end()) {
371 return common::Result<bool>::err(error_info(monitoring_error_code::not_found, "Dependency '" + dependency + "' not found").to_common_error());
372 }
373
374 if (would_create_cycle_internal(dependent, dependency)) {
375 return common::Result<bool>::err(error_info(monitoring_error_code::invalid_state, "Adding dependency would create a cycle").to_common_error());
376 }
377
378 dependencies_[dependent].push_back(dependency);
379 dependents_[dependency].push_back(dependent);
380 return common::ok(true);
381 }
382
388 std::vector<std::string> get_dependencies(const std::string& name) const {
389 std::shared_lock<std::shared_mutex> lock(mutex_);
390
391 auto it = dependencies_.find(name);
392 if (it != dependencies_.end()) {
393 return it->second;
394 }
395 return {};
396 }
397
403 std::vector<std::string> get_dependents(const std::string& name) const {
404 std::shared_lock<std::shared_mutex> lock(mutex_);
405
406 auto it = dependents_.find(name);
407 if (it != dependents_.end()) {
408 return it->second;
409 }
410 return {};
411 }
412
419 bool would_create_cycle(const std::string& from, const std::string& to) const {
420 std::shared_lock<std::shared_mutex> lock(mutex_);
421 return would_create_cycle_internal(from, to);
422 }
423
428 std::vector<std::string> topological_sort() const {
429 std::shared_lock<std::shared_mutex> lock(mutex_);
430
431 std::unordered_map<std::string, int> in_degree;
432 for (const auto& [name, _] : nodes_) {
433 in_degree[name] = 0;
434 }
435
436 for (const auto& [name, deps] : dependencies_) {
437 in_degree[name] = static_cast<int>(deps.size());
438 }
439
440 std::queue<std::string> queue;
441 for (const auto& [name, degree] : in_degree) {
442 if (degree == 0) {
443 queue.push(name);
444 }
445 }
446
447 std::vector<std::string> result;
448 result.reserve(nodes_.size());
449
450 while (!queue.empty()) {
451 std::string current = queue.front();
452 queue.pop();
453 result.push_back(current);
454
455 auto it = dependents_.find(current);
456 if (it != dependents_.end()) {
457 for (const auto& dep : it->second) {
458 if (--in_degree[dep] == 0) {
459 queue.push(dep);
460 }
461 }
462 }
463 }
464
465 return result;
466 }
467
474 std::shared_lock<std::shared_mutex> lock(mutex_);
475
476 auto it = nodes_.find(name);
477 if (it == nodes_.end()) {
478 return health_check_result::unhealthy("Node '" + name + "' not found");
479 }
480
481 auto deps_it = dependencies_.find(name);
482 if (deps_it != dependencies_.end()) {
483 for (const auto& dep_name : deps_it->second) {
484 auto dep_it = nodes_.find(dep_name);
485 if (dep_it != nodes_.end()) {
486 auto dep_result = dep_it->second->check();
487 if (dep_result.status == health_status::unhealthy) {
489 "Dependency '" + dep_name + "' is unhealthy: " + dep_result.message);
490 }
491 if (dep_result.status == health_status::degraded) {
493 "Dependency '" + dep_name + "' is degraded: " + dep_result.message);
494 }
495 }
496 }
497 }
498
499 return it->second->check();
500 }
501
507 std::vector<std::string> get_failure_impact(const std::string& name) const {
508 std::shared_lock<std::shared_mutex> lock(mutex_);
509
510 std::vector<std::string> impacted;
511 std::unordered_set<std::string> visited;
512 std::queue<std::string> to_visit;
513
514 auto it = dependents_.find(name);
515 if (it != dependents_.end()) {
516 for (const auto& dep : it->second) {
517 to_visit.push(dep);
518 }
519 }
520
521 while (!to_visit.empty()) {
522 std::string current = to_visit.front();
523 to_visit.pop();
524
525 if (visited.find(current) != visited.end()) {
526 continue;
527 }
528 visited.insert(current);
529 impacted.push_back(current);
530
531 auto dep_it = dependents_.find(current);
532 if (dep_it != dependents_.end()) {
533 for (const auto& dep : dep_it->second) {
534 if (visited.find(dep) == visited.end()) {
535 to_visit.push(dep);
536 }
537 }
538 }
539 }
540
541 return impacted;
542 }
543
544private:
545 bool would_create_cycle_internal(const std::string& from, const std::string& to) const {
546 if (from == to) {
547 return true;
548 }
549
550 std::unordered_set<std::string> visited;
551 std::queue<std::string> to_visit;
552 to_visit.push(to);
553
554 while (!to_visit.empty()) {
555 std::string current = to_visit.front();
556 to_visit.pop();
557
558 if (current == from) {
559 return true;
560 }
561
562 if (visited.find(current) != visited.end()) {
563 continue;
564 }
565 visited.insert(current);
566
567 auto it = dependencies_.find(current);
568 if (it != dependencies_.end()) {
569 for (const auto& dep : it->second) {
570 if (visited.find(dep) == visited.end()) {
571 to_visit.push(dep);
572 }
573 }
574 }
575 }
576
577 return false;
578 }
579
580 mutable std::shared_mutex mutex_;
581 std::unordered_map<std::string, std::shared_ptr<health_check>> nodes_;
582 std::unordered_map<std::string, std::vector<std::string>> dependencies_;
583 std::unordered_map<std::string, std::vector<std::string>> dependents_;
584};
585
606public:
612 health_check_builder& with_name(const std::string& name) {
613 name_ = name;
614 return *this;
615 }
616
623 type_ = type;
624 return *this;
625 }
626
633 check_func_ = std::move(func);
634 return *this;
635 }
636
642 health_check_builder& with_timeout(std::chrono::milliseconds timeout) {
643 timeout_ = timeout;
644 return *this;
645 }
646
652 health_check_builder& critical(bool is_critical) {
653 critical_ = is_critical;
654 return *this;
655 }
656
661 std::shared_ptr<functional_health_check> build() {
662 return std::make_shared<functional_health_check>(
664 }
665
666private:
667 std::string name_;
670 std::chrono::milliseconds timeout_{std::chrono::milliseconds(1000)};
671 bool critical_{false};
672};
673
697public:
699 health_monitor() = default;
700
705 explicit health_monitor(const health_monitor_config& config) : config_(config) {}
706
708 virtual ~health_monitor() { stop(); }
709
716 common::Result<bool> register_check(const std::string& name, std::shared_ptr<health_check> check) {
717 std::lock_guard<std::shared_mutex> lock(mutex_);
718
719 if (checks_.find(name) != checks_.end()) {
720 return common::Result<bool>::err(error_info(monitoring_error_code::already_exists, "Check '" + name + "' already registered").to_common_error());
721 }
722
723 checks_[name] = std::move(check);
724 auto graph_result = dependency_graph_.add_node(name, checks_[name]);
725 if (graph_result.is_err()) {
726 checks_.erase(name);
727 return common::Result<bool>::err(graph_result.error());
728 }
729 return common::ok(true);
730 }
731
737 common::Result<bool> unregister_check(const std::string& name) {
738 std::lock_guard<std::shared_mutex> lock(mutex_);
739
740 if (checks_.find(name) == checks_.end()) {
742 "Check '" + name + "' not found");
743 return common::Result<bool>::err(err.to_common_error());
744 }
745
746 checks_.erase(name);
747 recovery_handlers_.erase(name);
748 return common::ok(true);
749 }
750
756 common::Result<health_check_result> check(const std::string& name) {
757 std::lock_guard<std::shared_mutex> lock(mutex_);
758
759 auto it = checks_.find(name);
760 if (it == checks_.end()) {
762 "Check '" + name + "' not found");
763 return common::Result<health_check_result>::err(err.to_common_error());
764 }
765
766 auto result = dependency_graph_.check_with_dependencies(name);
767 update_stats(result);
768 cached_results_[name] = result;
769 return common::ok(result);
770 }
771
776 std::unordered_map<std::string, health_check_result> check_all() {
777 std::lock_guard<std::shared_mutex> lock(mutex_);
778
779 std::unordered_map<std::string, health_check_result> results;
780 for (const auto& [name, check] : checks_) {
781 auto result = check->check();
782 results[name] = result;
783 cached_results_[name] = result;
784 update_stats(result);
785 }
786 return results;
787 }
788
795 common::Result<bool> add_dependency(const std::string& dependent, const std::string& dependency) {
796 std::lock_guard<std::shared_mutex> lock(mutex_);
797 return dependency_graph_.add_dependency(dependent, dependency);
798 }
799
804 common::VoidResult start() {
805 std::lock_guard<std::mutex> lock(lifecycle_mutex_);
806
807 if (running_.load()) {
808 return common::ok();
809 }
810
811 running_.store(true);
812 monitor_thread_ = std::thread([this]() { run_monitoring_loop(); });
813 return common::ok();
814 }
815
820 common::VoidResult stop() {
821 std::lock_guard<std::mutex> lock(lifecycle_mutex_);
822
823 if (!running_.load()) {
824 return common::ok();
825 }
826
827 running_.store(false);
828 cv_.notify_all();
829
830 if (monitor_thread_.joinable()) {
831 monitor_thread_.join();
832 }
833
834 return common::ok();
835 }
836
841 bool is_running() const {
842 return running_.load();
843 }
844
851 void refresh() {
852 std::lock_guard<std::shared_mutex> lock(mutex_);
853
854 for (const auto& [name, check] : checks_) {
855 auto result = check->check();
856 cached_results_[name] = result;
857 update_stats(result);
858
859 if (result.status == health_status::unhealthy) {
860 auto it = recovery_handlers_.find(name);
863 if (it->second()) {
865 }
866 }
867 }
868 }
869
870 stats_.last_check_time = std::chrono::system_clock::now();
871 }
872
878 void register_recovery_handler(const std::string& check_name,
879 std::function<bool()> handler) {
880 std::lock_guard<std::shared_mutex> lock(mutex_);
881 recovery_handlers_[check_name] = std::move(handler);
882 }
883
890 std::shared_lock<std::shared_mutex> lock(mutex_);
891
892 if (checks_.empty()) {
894 }
895
896 bool has_unhealthy = false;
897 bool has_degraded = false;
898
899 for (const auto& [name, result] : cached_results_) {
900 if (result.status == health_status::unhealthy) {
901 has_unhealthy = true;
902 } else if (result.status == health_status::degraded) {
903 has_degraded = true;
904 }
905 }
906
907 if (has_unhealthy) return health_status::unhealthy;
908 if (has_degraded) return health_status::degraded;
910 }
911
917 std::shared_lock<std::shared_mutex> lock(mutex_);
918 return stats_;
919 }
920
925 std::string get_health_report() {
926 std::shared_lock<std::shared_mutex> lock(mutex_);
927
928 std::string report = "Health Report:\n";
929
930 if (cached_results_.empty()) {
931 report += " No health checks have been performed yet.\n";
932 return report;
933 }
934
935 for (const auto& [name, result] : cached_results_) {
936 report += " " + name + ": ";
937 switch (result.status) {
939 report += "HEALTHY";
940 break;
942 report += "DEGRADED";
943 break;
945 report += "UNHEALTHY";
946 break;
947 default:
948 report += "UNKNOWN";
949 break;
950 }
951 report += " - " + result.message + "\n";
952 }
953
954 return report;
955 }
956
962 health_check_result result;
964 result.message = "Health monitor operational";
965 result.timestamp = std::chrono::system_clock::now();
966 return result;
967 }
968
969private:
971 while (running_.load()) {
972 refresh();
973
974 std::unique_lock<std::mutex> lock(cv_mutex_);
975 cv_.wait_for(lock, config_.check_interval, [this]() {
976 return !running_.load();
977 });
978 }
979 }
980
981 void update_stats(const health_check_result& result) {
983 switch (result.status) {
986 break;
989 break;
992 break;
993 default:
994 break;
995 }
996 }
997
1001
1002 mutable std::shared_mutex mutex_;
1004 std::mutex cv_mutex_;
1005 std::condition_variable cv_;
1006
1007 std::unordered_map<std::string, std::shared_ptr<health_check>> checks_;
1008 std::unordered_map<std::string, std::function<bool()>> recovery_handlers_;
1009 std::unordered_map<std::string, health_check_result> cached_results_;
1010
1011 std::atomic<bool> running_{false};
1012 std::thread monitor_thread_;
1013};
1014
1020 static health_monitor instance;
1021 return instance;
1022}
1023
1024} // namespace kcenon::monitoring
Composite health check that aggregates multiple sub-checks.
void add_check(std::shared_ptr< health_check > check)
Add a child health check to this composite.
std::string get_name() const override
Get the human-readable name of this health check.
health_check_result check_all_required(const std::vector< health_check_result > &results)
composite_health_check(const std::string &name, health_check_type type, bool all_required=true)
Construct a composite health check.
health_check_type get_type() const override
Get the type of this health check (liveness, readiness, or startup).
health_check_result check_any_required(const std::vector< health_check_result > &results)
health_check_result check() override
Execute all child checks and return the aggregate result.
std::vector< std::shared_ptr< health_check > > checks_
Health check implementation backed by a std::function.
std::function< health_check_result()> check_func_
functional_health_check(const std::string &name, health_check_type type, std::function< health_check_result()> check_func, std::chrono::milliseconds timeout=std::chrono::milliseconds(1000), bool critical=false)
Construct a functional health check.
bool is_critical() const override
Whether this check is critical for overall system health.
std::chrono::milliseconds get_timeout() const override
Get the maximum time allowed for this check to complete.
std::string get_name() const override
Get the human-readable name of this health check.
health_check_result check() override
Execute the stored check function.
health_check_type get_type() const override
Get the type of this health check (liveness, readiness, or startup).
Fluent builder for creating functional_health_check instances.
std::function< health_check_result()> check_func_
health_check_builder & critical(bool is_critical)
Mark this check as critical for overall system health.
health_check_builder & with_check(std::function< health_check_result()> func)
Set the callable that performs the health check.
health_check_builder & with_type(health_check_type type)
Set the health check type.
health_check_builder & with_name(const std::string &name)
Set the health check name.
std::shared_ptr< functional_health_check > build()
Build and return the configured functional_health_check.
health_check_builder & with_timeout(std::chrono::milliseconds timeout)
Set the maximum duration allowed for the check.
Abstract base class for health checks.
virtual bool is_critical() const
Whether this check is critical for overall system health.
virtual std::chrono::milliseconds get_timeout() const
Get the maximum time allowed for this check to complete.
virtual std::string get_name() const =0
Get the human-readable name of this health check.
virtual health_check_result check()=0
Execute the health check and return the result.
virtual health_check_type get_type() const =0
Get the type of this health check (liveness, readiness, or startup).
Directed acyclic graph for health check dependencies.
std::unordered_map< std::string, std::vector< std::string > > dependencies_
bool would_create_cycle_internal(const std::string &from, const std::string &to) const
bool would_create_cycle(const std::string &from, const std::string &to) const
Check whether adding an edge from -> to would create a cycle.
std::vector< std::string > get_dependencies(const std::string &name) const
Get the direct dependencies of a node.
std::vector< std::string > get_failure_impact(const std::string &name) const
Compute all nodes that would be impacted if the given node fails.
std::vector< std::string > topological_sort() const
Compute a topological ordering of all nodes.
std::unordered_map< std::string, std::vector< std::string > > dependents_
std::vector< std::string > get_dependents(const std::string &name) const
Get the nodes that directly depend on the given node.
common::Result< bool > add_dependency(const std::string &dependent, const std::string &dependency)
Add a dependency edge: dependent depends on dependency.
health_check_result check_with_dependencies(const std::string &name)
Execute a health check after verifying all its dependencies are healthy.
common::Result< bool > add_node(const std::string &name, std::shared_ptr< health_check > check)
Add a health check node to the graph.
std::unordered_map< std::string, std::shared_ptr< health_check > > nodes_
Health monitor with dependency management, auto-recovery, and statistics.
std::unordered_map< std::string, std::function< bool()> > recovery_handlers_
health_monitor(const health_monitor_config &config)
Construct with custom configuration.
std::unordered_map< std::string, std::shared_ptr< health_check > > checks_
void refresh()
Manually refresh all health checks and trigger recovery if needed.
common::VoidResult stop()
Stop the periodic health monitoring background thread.
health_check_result check_health() const
Quick self-check of the health monitor itself.
common::Result< bool > unregister_check(const std::string &name)
Remove a previously registered health check.
health_dependency_graph dependency_graph_
void register_recovery_handler(const std::string &check_name, std::function< bool()> handler)
Register a recovery handler for a named health check.
std::unordered_map< std::string, health_check_result > check_all()
Execute all registered health checks.
void update_stats(const health_check_result &result)
common::Result< bool > add_dependency(const std::string &dependent, const std::string &dependency)
Add a dependency between two registered health checks.
health_monitor()=default
Default constructor with default configuration.
health_status get_overall_status()
Get the aggregate health status across all cached results.
virtual ~health_monitor()
Destructor. Stops the monitoring loop if running.
common::Result< health_check_result > check(const std::string &name)
Execute a single named health check (with dependency verification).
std::unordered_map< std::string, health_check_result > cached_results_
bool is_running() const
Check whether the monitoring background thread is running.
health_monitor_stats get_stats() const
Get accumulated health monitoring statistics.
common::VoidResult start()
Start the periodic health monitoring background thread.
std::string get_health_report()
Generate a human-readable health report.
common::Result< bool > register_check(const std::string &name, std::shared_ptr< health_check > check)
Register a named health check.
Core monitoring system interface definitions.
health_monitor & global_health_monitor()
Get the global health monitor singleton instance.
health_check_type
Types of health checks following Kubernetes probe conventions.
@ liveness
Indicates whether the process is alive and should be restarted if failing.
@ readiness
Indicates whether the service is ready to accept traffic.
@ startup
Indicates whether the application has finished initializing.
health_status
System health status levels.
Extended error information with context.
common::error_info to_common_error() const
Convert to common_system error_info.
Result of a health check operation.
static health_check_result unhealthy(const std::string &msg)
std::chrono::system_clock::time_point timestamp
static health_check_result healthy(const std::string &msg="OK")
static health_check_result degraded(const std::string &msg)
Configuration for the health_monitor.
std::chrono::seconds cache_duration
Duration to cache health check results.
std::chrono::milliseconds check_interval
Interval between automatic health check cycles.
size_t max_consecutive_failures
Failures before triggering recovery.
bool enable_auto_recovery
Whether to invoke recovery handlers on failure.
std::chrono::seconds recovery_timeout
Maximum time allowed for a recovery attempt.
Accumulated statistics for health monitoring operations.
std::chrono::system_clock::time_point last_check_time
Timestamp of the last check cycle.
size_t successful_recoveries
Number of successful recovery attempts.
size_t degraded_checks
Number of checks that returned degraded.
size_t total_checks
Total number of health checks performed.
size_t recovery_attempts
Number of auto-recovery attempts made.
size_t unhealthy_checks
Number of checks that returned unhealthy.
size_t healthy_checks
Number of checks that returned healthy.