111 default:
return "unknown";
317 if (comp.name == name)
343 bool has_unhealthy =
false;
344 bool has_degraded =
false;
345 bool has_unknown =
false;
352 has_unhealthy =
true;
370 else if (has_degraded)
375 else if (has_unknown)
401 std::ostringstream oss;
410 oss <<
" \"metrics\": {\n";
411 oss <<
" \"uptime_seconds\": " << std::setprecision(2) <<
uptime_seconds <<
",\n";
413 oss <<
" \"success_rate\": " << std::setprecision(4) <<
success_rate <<
",\n";
414 oss <<
" \"avg_latency_ms\": " << std::setprecision(3) <<
avg_latency_ms <<
"\n";
418 oss <<
" \"workers\": {\n";
425 oss <<
" \"queue\": {\n";
431 oss <<
" \"components\": [\n";
432 for (std::size_t i = 0; i <
components.size(); ++i)
436 oss <<
" \"name\": \"" << comp.name <<
"\",\n";
438 oss <<
" \"message\": \"" << comp.message <<
"\"";
440 if (!comp.details.empty())
442 oss <<
",\n \"details\": {\n";
443 std::size_t detail_idx = 0;
444 for (
const auto& [key, value] : comp.details)
446 oss <<
" \"" << key <<
"\": \"" << value <<
"\"";
447 if (++detail_idx < comp.details.size())
483 std::ostringstream oss;
491 oss <<
" Uptime: " << std::setprecision(1) <<
uptime_seconds <<
" seconds\n";
493 oss <<
" Success rate: " << std::setprecision(1) << (
success_rate * 100.0) <<
"%\n";
494 oss <<
" Avg latency: " << std::setprecision(2) <<
avg_latency_ms <<
" ms\n\n";
506 double saturation =
static_cast<double>(
queue_depth) /
509 << saturation <<
"% full)";
513 oss <<
"Components:\n";
517 << comp.name <<
": " << comp.message <<
"\n";
542 [[nodiscard]]
auto to_prometheus(
const std::string& pool_name =
"default") const
545 std::ostringstream oss;
549 double health_value = 0.0;
554 default: health_value = 0.0;
break;
556 oss <<
"# HELP thread_pool_health_status Health status (1=healthy, 0.5=degraded, 0=unhealthy)\n";
557 oss <<
"# TYPE thread_pool_health_status gauge\n";
558 oss <<
"thread_pool_health_status{pool=\"" << pool_name <<
"\"} "
559 << std::setprecision(1) << health_value <<
"\n\n";
562 oss <<
"# HELP thread_pool_uptime_seconds Total uptime in seconds\n";
563 oss <<
"# TYPE thread_pool_uptime_seconds counter\n";
564 oss <<
"thread_pool_uptime_seconds{pool=\"" << pool_name <<
"\"} "
568 oss <<
"# HELP thread_pool_jobs_total Total number of jobs processed\n";
569 oss <<
"# TYPE thread_pool_jobs_total counter\n";
570 oss <<
"thread_pool_jobs_total{pool=\"" << pool_name <<
"\"} "
574 oss <<
"# HELP thread_pool_success_rate Ratio of successful jobs (0.0 to 1.0)\n";
575 oss <<
"# TYPE thread_pool_success_rate gauge\n";
576 oss <<
"thread_pool_success_rate{pool=\"" << pool_name <<
"\"} "
580 oss <<
"# HELP thread_pool_latency_avg_ms Average job latency in milliseconds\n";
581 oss <<
"# TYPE thread_pool_latency_avg_ms gauge\n";
582 oss <<
"thread_pool_latency_avg_ms{pool=\"" << pool_name <<
"\"} "
586 oss <<
"# HELP thread_pool_workers_total Total number of workers\n";
587 oss <<
"# TYPE thread_pool_workers_total gauge\n";
588 oss <<
"thread_pool_workers_total{pool=\"" << pool_name <<
"\"} "
591 oss <<
"# HELP thread_pool_workers_active Number of active workers\n";
592 oss <<
"# TYPE thread_pool_workers_active gauge\n";
593 oss <<
"thread_pool_workers_active{pool=\"" << pool_name <<
"\"} "
596 oss <<
"# HELP thread_pool_workers_idle Number of idle workers\n";
597 oss <<
"# TYPE thread_pool_workers_idle gauge\n";
598 oss <<
"thread_pool_workers_idle{pool=\"" << pool_name <<
"\"} "
602 oss <<
"# HELP thread_pool_queue_depth Current queue depth\n";
603 oss <<
"# TYPE thread_pool_queue_depth gauge\n";
604 oss <<
"thread_pool_queue_depth{pool=\"" << pool_name <<
"\"} "
609 oss <<
"# HELP thread_pool_queue_capacity Maximum queue capacity\n";
610 oss <<
"# TYPE thread_pool_queue_capacity gauge\n";
611 oss <<
"thread_pool_queue_capacity{pool=\"" << pool_name <<
"\"} "
614 double saturation =
static_cast<double>(
queue_depth) /
616 oss <<
"# HELP thread_pool_queue_saturation Queue saturation ratio (0.0 to 1.0)\n";
617 oss <<
"# TYPE thread_pool_queue_saturation gauge\n";
618 oss <<
"thread_pool_queue_saturation{pool=\"" << pool_name <<
"\"} "
619 << std::setprecision(4) << saturation <<
"\n\n";
625 double comp_health = 0.0;
630 default: comp_health = 0.0;
break;
632 oss <<
"# HELP thread_pool_component_health Component health status\n";
633 oss <<
"# TYPE thread_pool_component_health gauge\n";
634 oss <<
"thread_pool_component_health{pool=\"" << pool_name
635 <<
"\",component=\"" << comp.name <<
"\"} "
636 << std::setprecision(1) << comp_health <<
"\n";
health_state
Overall health state of a component or system.
@ healthy
Component is fully operational.
@ degraded
Component is operational but with reduced capacity/performance.
@ unknown
Health state cannot be determined.
@ unhealthy
Component is not operational or failing.
auto health_state_to_http_code(health_state state) -> int
Gets HTTP status code for health state.
auto health_state_to_string(health_state state) -> std::string
Converts health_state to human-readable string.
Health status of a single component.
std::string name
Name of the component (e.g., "workers", "queue", "metrics").
health_state state
Current health state of this component.
std::string message
Human-readable message describing the current state.
std::map< std::string, std::string > details
Additional details about this component's health.
auto is_operational() const -> bool
Checks if this component is operational.
Comprehensive health status of the thread pool.
double success_rate
Job success rate (0.0 to 1.0).
std::size_t active_workers
Number of active workers.
std::vector< component_health > components
Health status of individual components.
health_state overall_status
Overall health state of the thread pool.
auto is_healthy() const -> bool
Checks if the thread pool is fully healthy.
std::size_t total_workers
Total number of workers.
std::string status_message
Human-readable message about overall status.
auto http_status_code() const -> int
Gets HTTP status code for this health status.
std::uint64_t total_jobs_processed
Total number of jobs processed since startup.
auto to_string() const -> std::string
Converts health status to human-readable string.
std::size_t queue_capacity
Queue capacity (if bounded).
double uptime_seconds
Time since the thread pool was started (seconds).
auto calculate_overall_status() -> void
Calculates overall status from component states.
std::chrono::steady_clock::time_point check_time
Time when this health check was performed.
auto to_json() const -> std::string
Converts health status to JSON string.
auto find_component(const std::string &name) const -> const component_health *
Finds a component by name.
auto is_operational() const -> bool
Checks if the thread pool is operational.
auto to_prometheus(const std::string &pool_name="default") const -> std::string
Converts health status to Prometheus-compatible metrics format.
std::size_t queue_depth
Current queue depth.
double avg_latency_ms
Average job latency in milliseconds.
Configurable thresholds for health status determination.
double unhealthy_success_rate
Success rate below which pool is unhealthy (0.0 to 1.0).
double worker_utilization_warning
Worker utilization threshold for degraded status (0.0 to 1.0).
double max_healthy_latency_ms
Maximum average latency (ms) for healthy status.
double queue_saturation_critical
Queue saturation threshold for unhealthy status (0.0 to 1.0).
std::size_t min_idle_workers
Minimum number of idle workers required for healthy status.
double queue_saturation_warning
Queue saturation threshold for degraded status (0.0 to 1.0).
double degraded_latency_ms
Latency (ms) above which pool is considered degraded.
double min_success_rate
Minimum success rate for healthy status (0.0 to 1.0).