48#include <condition_variable>
53#include <shared_mutex>
56#include <unordered_map>
57#include <unordered_set>
137 return std::chrono::milliseconds(1000);
170 std::chrono::milliseconds timeout = std::chrono::milliseconds(1000),
230 bool all_required =
true)
245 std::lock_guard<std::mutex> lock(
mutex_);
254 std::lock_guard<std::mutex> lock(
mutex_);
260 std::vector<health_check_result> results;
261 results.reserve(
checks_.size());
263 for (
const auto& chk :
checks_) {
264 results.push_back(chk->check());
276 bool has_unhealthy =
false;
277 bool has_degraded =
false;
280 for (
const auto& result : results) {
282 has_unhealthy =
true;
283 message += result.message +
"; ";
286 message += result.message +
"; ";
300 bool any_healthy =
false;
303 for (
const auto& result : results) {
308 message += result.message +
"; ";
320 std::vector<std::shared_ptr<health_check>>
checks_;
345 common::Result<bool>
add_node(
const std::string& name, std::shared_ptr<health_check> check) {
346 std::lock_guard<std::shared_mutex> lock(
mutex_);
352 nodes_[name] = std::move(check);
355 return common::ok(
true);
364 common::Result<bool>
add_dependency(
const std::string& dependent,
const std::string& dependency) {
365 std::lock_guard<std::shared_mutex> lock(
mutex_);
380 return common::ok(
true);
389 std::shared_lock<std::shared_mutex> lock(
mutex_);
404 std::shared_lock<std::shared_mutex> lock(
mutex_);
420 std::shared_lock<std::shared_mutex> lock(
mutex_);
429 std::shared_lock<std::shared_mutex> lock(
mutex_);
431 std::unordered_map<std::string, int> in_degree;
432 for (
const auto& [name, _] :
nodes_) {
437 in_degree[name] =
static_cast<int>(deps.size());
440 std::queue<std::string> queue;
441 for (
const auto& [name, degree] : in_degree) {
447 std::vector<std::string> result;
448 result.reserve(
nodes_.size());
450 while (!queue.empty()) {
451 std::string current = queue.front();
453 result.push_back(current);
457 for (
const auto& dep : it->second) {
458 if (--in_degree[dep] == 0) {
474 std::shared_lock<std::shared_mutex> lock(
mutex_);
476 auto it =
nodes_.find(name);
483 for (
const auto& dep_name : deps_it->second) {
484 auto dep_it =
nodes_.find(dep_name);
485 if (dep_it !=
nodes_.end()) {
486 auto dep_result = dep_it->second->check();
489 "Dependency '" + dep_name +
"' is unhealthy: " + dep_result.message);
493 "Dependency '" + dep_name +
"' is degraded: " + dep_result.message);
499 return it->second->check();
508 std::shared_lock<std::shared_mutex> lock(
mutex_);
510 std::vector<std::string> impacted;
511 std::unordered_set<std::string> visited;
512 std::queue<std::string> to_visit;
516 for (
const auto& dep : it->second) {
521 while (!to_visit.empty()) {
522 std::string current = to_visit.front();
525 if (visited.find(current) != visited.end()) {
528 visited.insert(current);
529 impacted.push_back(current);
533 for (
const auto& dep : dep_it->second) {
534 if (visited.find(dep) == visited.end()) {
550 std::unordered_set<std::string> visited;
551 std::queue<std::string> to_visit;
554 while (!to_visit.empty()) {
555 std::string current = to_visit.front();
558 if (current == from) {
562 if (visited.find(current) != visited.end()) {
565 visited.insert(current);
569 for (
const auto& dep : it->second) {
570 if (visited.find(dep) == visited.end()) {
581 std::unordered_map<std::string, std::shared_ptr<health_check>>
nodes_;
583 std::unordered_map<std::string, std::vector<std::string>>
dependents_;
661 std::shared_ptr<functional_health_check>
build() {
662 return std::make_shared<functional_health_check>(
670 std::chrono::milliseconds
timeout_{std::chrono::milliseconds(1000)};
717 std::lock_guard<std::shared_mutex> lock(
mutex_);
725 if (graph_result.is_err()) {
727 return common::Result<bool>::err(graph_result.error());
729 return common::ok(
true);
738 std::lock_guard<std::shared_mutex> lock(
mutex_);
742 "Check '" + name +
"' not found");
748 return common::ok(
true);
756 common::Result<health_check_result>
check(
const std::string& name) {
757 std::lock_guard<std::shared_mutex> lock(
mutex_);
762 "Check '" + name +
"' not found");
763 return common::Result<health_check_result>::err(err.
to_common_error());
769 return common::ok(result);
776 std::unordered_map<std::string, health_check_result>
check_all() {
777 std::lock_guard<std::shared_mutex> lock(
mutex_);
779 std::unordered_map<std::string, health_check_result> results;
781 auto result =
check->check();
782 results[name] = result;
795 common::Result<bool>
add_dependency(
const std::string& dependent,
const std::string& dependency) {
796 std::lock_guard<std::shared_mutex> lock(
mutex_);
852 std::lock_guard<std::shared_mutex> lock(
mutex_);
855 auto result =
check->check();
879 std::function<
bool()> handler) {
880 std::lock_guard<std::shared_mutex> lock(
mutex_);
890 std::shared_lock<std::shared_mutex> lock(
mutex_);
896 bool has_unhealthy =
false;
897 bool has_degraded =
false;
901 has_unhealthy =
true;
917 std::shared_lock<std::shared_mutex> lock(
mutex_);
926 std::shared_lock<std::shared_mutex> lock(
mutex_);
928 std::string report =
"Health Report:\n";
931 report +=
" No health checks have been performed yet.\n";
936 report +=
" " + name +
": ";
937 switch (result.status) {
942 report +=
"DEGRADED";
945 report +=
"UNHEALTHY";
951 report +=
" - " + result.message +
"\n";
964 result.
message =
"Health monitor operational";
965 result.
timestamp = std::chrono::system_clock::now();
974 std::unique_lock<std::mutex> lock(
cv_mutex_);
976 return !running_.load();
1007 std::unordered_map<std::string, std::shared_ptr<health_check>>
checks_;
Composite health check that aggregates multiple sub-checks.
void add_check(std::shared_ptr< health_check > check)
Add a child health check to this composite.
std::string get_name() const override
Get the human-readable name of this health check.
health_check_result check_all_required(const std::vector< health_check_result > &results)
composite_health_check(const std::string &name, health_check_type type, bool all_required=true)
Construct a composite health check.
health_check_type get_type() const override
Get the type of this health check (liveness, readiness, or startup).
health_check_result check_any_required(const std::vector< health_check_result > &results)
health_check_result check() override
Execute all child checks and return the aggregate result.
std::vector< std::shared_ptr< health_check > > checks_
Health check implementation backed by a std::function.
std::function< health_check_result()> check_func_
functional_health_check(const std::string &name, health_check_type type, std::function< health_check_result()> check_func, std::chrono::milliseconds timeout=std::chrono::milliseconds(1000), bool critical=false)
Construct a functional health check.
bool is_critical() const override
Whether this check is critical for overall system health.
std::chrono::milliseconds get_timeout() const override
Get the maximum time allowed for this check to complete.
std::string get_name() const override
Get the human-readable name of this health check.
health_check_result check() override
Execute the stored check function.
health_check_type get_type() const override
Get the type of this health check (liveness, readiness, or startup).
std::chrono::milliseconds timeout_
Fluent builder for creating functional_health_check instances.
std::chrono::milliseconds timeout_
std::function< health_check_result()> check_func_
health_check_builder & critical(bool is_critical)
Mark this check as critical for overall system health.
health_check_builder & with_check(std::function< health_check_result()> func)
Set the callable that performs the health check.
health_check_builder & with_type(health_check_type type)
Set the health check type.
health_check_builder & with_name(const std::string &name)
Set the health check name.
std::shared_ptr< functional_health_check > build()
Build and return the configured functional_health_check.
health_check_builder & with_timeout(std::chrono::milliseconds timeout)
Set the maximum duration allowed for the check.
Abstract base class for health checks.
virtual bool is_critical() const
Whether this check is critical for overall system health.
virtual std::chrono::milliseconds get_timeout() const
Get the maximum time allowed for this check to complete.
virtual std::string get_name() const =0
Get the human-readable name of this health check.
virtual health_check_result check()=0
Execute the health check and return the result.
virtual ~health_check()=default
virtual health_check_type get_type() const =0
Get the type of this health check (liveness, readiness, or startup).
Directed acyclic graph for health check dependencies.
std::unordered_map< std::string, std::vector< std::string > > dependencies_
bool would_create_cycle_internal(const std::string &from, const std::string &to) const
bool would_create_cycle(const std::string &from, const std::string &to) const
Check whether adding an edge from node 'from' to node 'to' would create a cycle.
std::vector< std::string > get_dependencies(const std::string &name) const
Get the direct dependencies of a node.
std::vector< std::string > get_failure_impact(const std::string &name) const
Compute all nodes that would be impacted if the given node fails.
std::vector< std::string > topological_sort() const
Compute a topological ordering of all nodes.
std::unordered_map< std::string, std::vector< std::string > > dependents_
std::vector< std::string > get_dependents(const std::string &name) const
Get the nodes that directly depend on the given node.
common::Result< bool > add_dependency(const std::string &dependent, const std::string &dependency)
Add a dependency edge: dependent depends on dependency.
health_check_result check_with_dependencies(const std::string &name)
Execute a health check after verifying all its dependencies are healthy.
common::Result< bool > add_node(const std::string &name, std::shared_ptr< health_check > check)
Add a health check node to the graph.
std::unordered_map< std::string, std::shared_ptr< health_check > > nodes_
Health monitor with dependency management, auto-recovery, and statistics.
std::unordered_map< std::string, std::function< bool()> > recovery_handlers_
health_monitor(const health_monitor_config &config)
Construct with custom configuration.
void run_monitoring_loop()
std::atomic< bool > running_
health_monitor_config config_
std::unordered_map< std::string, std::shared_ptr< health_check > > checks_
void refresh()
Manually refresh all health checks and trigger recovery if needed.
common::VoidResult stop()
Stop the periodic health monitoring background thread.
health_check_result check_health() const
Perform a quick self-check of the health monitor itself.
common::Result< bool > unregister_check(const std::string &name)
Remove a previously registered health check.
health_dependency_graph dependency_graph_
void register_recovery_handler(const std::string &check_name, std::function< bool()> handler)
Register a recovery handler for a named health check.
std::unordered_map< std::string, health_check_result > check_all()
Execute all registered health checks.
void update_stats(const health_check_result &result)
common::Result< bool > add_dependency(const std::string &dependent, const std::string &dependency)
Add a dependency between two registered health checks.
health_monitor()=default
Default constructor with default configuration.
health_status get_overall_status()
Get the aggregate health status across all cached results.
virtual ~health_monitor()
Destructor. Stops the monitoring loop if running.
std::condition_variable cv_
common::Result< health_check_result > check(const std::string &name)
Execute a single named health check (with dependency verification).
std::unordered_map< std::string, health_check_result > cached_results_
bool is_running() const
Check whether the monitoring background thread is running.
health_monitor_stats get_stats() const
Get accumulated health monitoring statistics.
common::VoidResult start()
Start the periodic health monitoring background thread.
std::thread monitor_thread_
std::mutex lifecycle_mutex_
std::string get_health_report()
Generate a human-readable health report.
common::Result< bool > register_check(const std::string &name, std::shared_ptr< health_check > check)
Register a named health check.
health_monitor_stats stats_
Core monitoring system interface definitions.
health_monitor & global_health_monitor()
Get the global health monitor singleton instance.
health_check_type
Types of health checks following Kubernetes probe conventions.
@ liveness
Indicates whether the process is alive and should be restarted if failing.
@ readiness
Indicates whether the service is ready to accept traffic.
@ startup
Indicates whether the application has finished initializing.
health_status
System health status levels.
Extended error information with context.
common::error_info to_common_error() const
Convert to common_system error_info.
Result of a health check operation.
static health_check_result unhealthy(const std::string &msg)
std::chrono::system_clock::time_point timestamp
static health_check_result healthy(const std::string &msg="OK")
static health_check_result degraded(const std::string &msg)
Configuration for the health_monitor.
std::chrono::seconds cache_duration
Duration to cache health check results.
std::chrono::milliseconds check_interval
Interval between automatic health check cycles.
size_t max_consecutive_failures
Maximum number of consecutive failures before triggering recovery.
bool enable_auto_recovery
Whether to invoke recovery handlers on failure.
std::chrono::seconds recovery_timeout
Maximum time allowed for a recovery attempt.
Accumulated statistics for health monitoring operations.
std::chrono::system_clock::time_point last_check_time
Timestamp of the last check cycle.
size_t successful_recoveries
Number of successful recovery attempts.
size_t degraded_checks
Number of checks that returned degraded.
size_t total_checks
Total number of health checks performed.
size_t recovery_attempts
Number of auto-recovery attempts made.
size_t unhealthy_checks
Number of checks that returned unhealthy.
size_t healthy_checks
Number of checks that returned healthy.