Thread System 0.3.1
High-performance C++20 thread pool with work stealing and DAG scheduling
Loading...
Searching...
No Matches
health_status.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2024, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
12#pragma once
13
14#include <chrono>
15#include <cstdint>
16#include <iomanip>
17#include <map>
18#include <sstream>
19#include <string>
20#include <vector>
21
23{
35 {
41 double min_success_rate{0.95};
42
47
52
56 double degraded_latency_ms{500.0};
57
62
67
72
78 std::size_t min_idle_workers{0};
79 };
80
/**
 * @brief Overall health state of a component or system.
 *
 * The enumerator values are spelled out explicitly; they match the implicit
 * numbering of a plain declaration in this order.
 */
enum class health_state
{
    healthy = 0,    ///< Component is fully operational.
    degraded = 1,   ///< Component is operational but with reduced capacity/performance.
    unhealthy = 2,  ///< Component is not operational or failing.
    unknown = 3     ///< Health state cannot be determined.
};
97
103 [[nodiscard]] inline auto health_state_to_string(health_state state) -> std::string
104 {
105 switch (state)
106 {
107 case health_state::healthy: return "healthy";
108 case health_state::degraded: return "degraded";
109 case health_state::unhealthy: return "unhealthy";
110 case health_state::unknown: return "unknown";
111 default: return "unknown";
112 }
113 }
114
122 [[nodiscard]] inline auto health_state_to_http_code(health_state state) -> int
123 {
124 switch (state)
125 {
126 case health_state::healthy: return 200;
127 case health_state::degraded: return 200; // Still operational
128 case health_state::unhealthy: return 503;
129 case health_state::unknown: return 503;
130 default: return 503;
131 }
132 }
133
144 {
148 std::string name;
149
154
158 std::string message;
159
165 std::map<std::string, std::string> details;
166
171 [[nodiscard]] auto is_operational() const -> bool
172 {
173 return state == health_state::healthy ||
175 }
176 };
177
206 {
215
219 std::string status_message;
220
224 std::chrono::steady_clock::time_point check_time;
225
229 std::vector<component_health> components;
230
231 // =========================================================================
232 // Summary Metrics
233 // =========================================================================
234
238 double uptime_seconds{0.0};
239
243 std::uint64_t total_jobs_processed{0};
244
248 double success_rate{1.0};
249
253 double avg_latency_ms{0.0};
254
258 std::size_t active_workers{0};
259
263 std::size_t total_workers{0};
264
268 std::size_t queue_depth{0};
269
273 std::size_t queue_capacity{0};
274
275 // =========================================================================
276 // Computed Properties
277 // =========================================================================
278
283 [[nodiscard]] auto is_operational() const -> bool
284 {
287 }
288
293 [[nodiscard]] auto is_healthy() const -> bool
294 {
296 }
297
302 [[nodiscard]] auto http_status_code() const -> int
303 {
305 }
306
312 [[nodiscard]] auto find_component(const std::string& name) const
313 -> const component_health*
314 {
315 for (const auto& comp : components)
316 {
317 if (comp.name == name)
318 {
319 return &comp;
320 }
321 }
322 return nullptr;
323 }
324
335 {
336 if (components.empty())
337 {
339 status_message = "No components registered";
340 return;
341 }
342
343 bool has_unhealthy = false;
344 bool has_degraded = false;
345 bool has_unknown = false;
346
347 for (const auto& comp : components)
348 {
349 switch (comp.state)
350 {
352 has_unhealthy = true;
353 break;
355 has_degraded = true;
356 break;
358 has_unknown = true;
359 break;
360 default:
361 break;
362 }
363 }
364
365 if (has_unhealthy)
366 {
368 status_message = "One or more components are unhealthy";
369 }
370 else if (has_degraded)
371 {
373 status_message = "One or more components are degraded";
374 }
375 else if (has_unknown)
376 {
378 status_message = "One or more components have unknown status";
379 }
380 else
381 {
383 status_message = "All components are healthy";
384 }
385 }
386
387 // =========================================================================
388 // Serialization
389 // =========================================================================
390
399 [[nodiscard]] auto to_json() const -> std::string
400 {
401 std::ostringstream oss;
402 oss << std::fixed;
403
404 oss << "{\n";
405 oss << " \"status\": \"" << health_state_to_string(overall_status) << "\",\n";
406 oss << " \"message\": \"" << status_message << "\",\n";
407 oss << " \"http_code\": " << http_status_code() << ",\n";
408
409 // Metrics
410 oss << " \"metrics\": {\n";
411 oss << " \"uptime_seconds\": " << std::setprecision(2) << uptime_seconds << ",\n";
412 oss << " \"total_jobs_processed\": " << total_jobs_processed << ",\n";
413 oss << " \"success_rate\": " << std::setprecision(4) << success_rate << ",\n";
414 oss << " \"avg_latency_ms\": " << std::setprecision(3) << avg_latency_ms << "\n";
415 oss << " },\n";
416
417 // Workers
418 oss << " \"workers\": {\n";
419 oss << " \"total\": " << total_workers << ",\n";
420 oss << " \"active\": " << active_workers << ",\n";
421 oss << " \"idle\": " << (total_workers - active_workers) << "\n";
422 oss << " },\n";
423
424 // Queue
425 oss << " \"queue\": {\n";
426 oss << " \"depth\": " << queue_depth << ",\n";
427 oss << " \"capacity\": " << queue_capacity << "\n";
428 oss << " },\n";
429
430 // Components
431 oss << " \"components\": [\n";
432 for (std::size_t i = 0; i < components.size(); ++i)
433 {
434 const auto& comp = components[i];
435 oss << " {\n";
436 oss << " \"name\": \"" << comp.name << "\",\n";
437 oss << " \"status\": \"" << health_state_to_string(comp.state) << "\",\n";
438 oss << " \"message\": \"" << comp.message << "\"";
439
440 if (!comp.details.empty())
441 {
442 oss << ",\n \"details\": {\n";
443 std::size_t detail_idx = 0;
444 for (const auto& [key, value] : comp.details)
445 {
446 oss << " \"" << key << "\": \"" << value << "\"";
447 if (++detail_idx < comp.details.size())
448 {
449 oss << ",";
450 }
451 oss << "\n";
452 }
453 oss << " }\n";
454 }
455 else
456 {
457 oss << "\n";
458 }
459
460 oss << " }";
461 if (i < components.size() - 1)
462 {
463 oss << ",";
464 }
465 oss << "\n";
466 }
467 oss << " ]\n";
468 oss << "}";
469
470 return oss.str();
471 }
472
481 [[nodiscard]] auto to_string() const -> std::string
482 {
483 std::ostringstream oss;
484 oss << std::fixed;
485
486 oss << "=== Health Status: " << health_state_to_string(overall_status)
487 << " (HTTP " << http_status_code() << ") ===\n";
488 oss << "Message: " << status_message << "\n\n";
489
490 oss << "Metrics:\n";
491 oss << " Uptime: " << std::setprecision(1) << uptime_seconds << " seconds\n";
492 oss << " Jobs processed: " << total_jobs_processed << "\n";
493 oss << " Success rate: " << std::setprecision(1) << (success_rate * 100.0) << "%\n";
494 oss << " Avg latency: " << std::setprecision(2) << avg_latency_ms << " ms\n\n";
495
496 oss << "Workers: " << active_workers << "/" << total_workers << " active";
497 if (total_workers > 0)
498 {
499 oss << " (" << (total_workers - active_workers) << " idle)";
500 }
501 oss << "\n";
502
503 oss << "Queue: " << queue_depth;
504 if (queue_capacity > 0)
505 {
506 double saturation = static_cast<double>(queue_depth) /
507 static_cast<double>(queue_capacity) * 100.0;
508 oss << "/" << queue_capacity << " (" << std::setprecision(1)
509 << saturation << "% full)";
510 }
511 oss << "\n\n";
512
513 oss << "Components:\n";
514 for (const auto& comp : components)
515 {
516 oss << " [" << health_state_to_string(comp.state) << "] "
517 << comp.name << ": " << comp.message << "\n";
518 }
519
520 return oss.str();
521 }
522
542 [[nodiscard]] auto to_prometheus(const std::string& pool_name = "default") const
543 -> std::string
544 {
545 std::ostringstream oss;
546 oss << std::fixed;
547
548 // Health status (1 = healthy, 0.5 = degraded, 0 = unhealthy/unknown)
549 double health_value = 0.0;
550 switch (overall_status)
551 {
552 case health_state::healthy: health_value = 1.0; break;
553 case health_state::degraded: health_value = 0.5; break;
554 default: health_value = 0.0; break;
555 }
556 oss << "# HELP thread_pool_health_status Health status (1=healthy, 0.5=degraded, 0=unhealthy)\n";
557 oss << "# TYPE thread_pool_health_status gauge\n";
558 oss << "thread_pool_health_status{pool=\"" << pool_name << "\"} "
559 << std::setprecision(1) << health_value << "\n\n";
560
561 // Uptime
562 oss << "# HELP thread_pool_uptime_seconds Total uptime in seconds\n";
563 oss << "# TYPE thread_pool_uptime_seconds counter\n";
564 oss << "thread_pool_uptime_seconds{pool=\"" << pool_name << "\"} "
565 << std::setprecision(2) << uptime_seconds << "\n\n";
566
567 // Jobs processed
568 oss << "# HELP thread_pool_jobs_total Total number of jobs processed\n";
569 oss << "# TYPE thread_pool_jobs_total counter\n";
570 oss << "thread_pool_jobs_total{pool=\"" << pool_name << "\"} "
571 << total_jobs_processed << "\n\n";
572
573 // Success rate
574 oss << "# HELP thread_pool_success_rate Ratio of successful jobs (0.0 to 1.0)\n";
575 oss << "# TYPE thread_pool_success_rate gauge\n";
576 oss << "thread_pool_success_rate{pool=\"" << pool_name << "\"} "
577 << std::setprecision(4) << success_rate << "\n\n";
578
579 // Average latency
580 oss << "# HELP thread_pool_latency_avg_ms Average job latency in milliseconds\n";
581 oss << "# TYPE thread_pool_latency_avg_ms gauge\n";
582 oss << "thread_pool_latency_avg_ms{pool=\"" << pool_name << "\"} "
583 << std::setprecision(3) << avg_latency_ms << "\n\n";
584
585 // Workers
586 oss << "# HELP thread_pool_workers_total Total number of workers\n";
587 oss << "# TYPE thread_pool_workers_total gauge\n";
588 oss << "thread_pool_workers_total{pool=\"" << pool_name << "\"} "
589 << total_workers << "\n\n";
590
591 oss << "# HELP thread_pool_workers_active Number of active workers\n";
592 oss << "# TYPE thread_pool_workers_active gauge\n";
593 oss << "thread_pool_workers_active{pool=\"" << pool_name << "\"} "
594 << active_workers << "\n\n";
595
596 oss << "# HELP thread_pool_workers_idle Number of idle workers\n";
597 oss << "# TYPE thread_pool_workers_idle gauge\n";
598 oss << "thread_pool_workers_idle{pool=\"" << pool_name << "\"} "
599 << (total_workers - active_workers) << "\n\n";
600
601 // Queue
602 oss << "# HELP thread_pool_queue_depth Current queue depth\n";
603 oss << "# TYPE thread_pool_queue_depth gauge\n";
604 oss << "thread_pool_queue_depth{pool=\"" << pool_name << "\"} "
605 << queue_depth << "\n\n";
606
607 if (queue_capacity > 0)
608 {
609 oss << "# HELP thread_pool_queue_capacity Maximum queue capacity\n";
610 oss << "# TYPE thread_pool_queue_capacity gauge\n";
611 oss << "thread_pool_queue_capacity{pool=\"" << pool_name << "\"} "
612 << queue_capacity << "\n\n";
613
614 double saturation = static_cast<double>(queue_depth) /
615 static_cast<double>(queue_capacity);
616 oss << "# HELP thread_pool_queue_saturation Queue saturation ratio (0.0 to 1.0)\n";
617 oss << "# TYPE thread_pool_queue_saturation gauge\n";
618 oss << "thread_pool_queue_saturation{pool=\"" << pool_name << "\"} "
619 << std::setprecision(4) << saturation << "\n\n";
620 }
621
622 // Component health
623 for (const auto& comp : components)
624 {
625 double comp_health = 0.0;
626 switch (comp.state)
627 {
628 case health_state::healthy: comp_health = 1.0; break;
629 case health_state::degraded: comp_health = 0.5; break;
630 default: comp_health = 0.0; break;
631 }
632 oss << "# HELP thread_pool_component_health Component health status\n";
633 oss << "# TYPE thread_pool_component_health gauge\n";
634 oss << "thread_pool_component_health{pool=\"" << pool_name
635 << "\",component=\"" << comp.name << "\"} "
636 << std::setprecision(1) << comp_health << "\n";
637 }
638
639 return oss.str();
640 }
641 };
642
643} // namespace kcenon::thread::diagnostics
health_state
Overall health state of a component or system.
@ healthy
Component is fully operational.
@ degraded
Component is operational but with reduced capacity/performance.
@ unknown
Health state cannot be determined.
@ unhealthy
Component is not operational or failing.
auto health_state_to_http_code(health_state state) -> int
Gets HTTP status code for health state.
auto health_state_to_string(health_state state) -> std::string
Converts health_state to human-readable string.
STL namespace.
Health status of a single component.
std::string name
Name of the component (e.g., "workers", "queue", "metrics").
health_state state
Current health state of this component.
std::string message
Human-readable message describing the current state.
std::map< std::string, std::string > details
Additional details about this component's health.
auto is_operational() const -> bool
Checks if this component is operational.
Comprehensive health status of the thread pool.
double success_rate
Job success rate (0.0 to 1.0).
std::size_t active_workers
Number of active workers.
std::vector< component_health > components
Health status of individual components.
health_state overall_status
Overall health state of the thread pool.
auto is_healthy() const -> bool
Checks if the thread pool is fully healthy.
std::size_t total_workers
Total number of workers.
std::string status_message
Human-readable message about overall status.
auto http_status_code() const -> int
Gets HTTP status code for this health status.
std::uint64_t total_jobs_processed
Total number of jobs processed since startup.
auto to_string() const -> std::string
Converts health status to human-readable string.
std::size_t queue_capacity
Queue capacity (if bounded).
double uptime_seconds
Time since the thread pool was started (seconds).
auto calculate_overall_status() -> void
Calculates overall status from component states.
std::chrono::steady_clock::time_point check_time
Time when this health check was performed.
auto to_json() const -> std::string
Converts health status to JSON string.
auto find_component(const std::string &name) const -> const component_health *
Finds a component by name.
auto is_operational() const -> bool
Checks if the thread pool is operational.
auto to_prometheus(const std::string &pool_name="default") const -> std::string
Converts health status to Prometheus-compatible metrics format.
std::size_t queue_depth
Current queue depth.
double avg_latency_ms
Average job latency in milliseconds.
Configurable thresholds for health status determination.
double unhealthy_success_rate
Success rate below which pool is unhealthy (0.0 to 1.0).
double worker_utilization_warning
Worker utilization threshold for degraded status (0.0 to 1.0).
double max_healthy_latency_ms
Maximum average latency (ms) for healthy status.
double queue_saturation_critical
Queue saturation threshold for unhealthy status (0.0 to 1.0).
std::size_t min_idle_workers
Minimum number of idle workers required for healthy status.
double queue_saturation_warning
Queue saturation threshold for degraded status (0.0 to 1.0).
double degraded_latency_ms
Latency (ms) above which pool is considered degraded.
double min_success_rate
Minimum success rate for healthy status (0.0 to 1.0).