thread_system/thread__pool__diagnostics_8cpp_source.html

// BSD 3-Clause License

// Copyright (c) 2024, 🍀☀🌕🌥 🌊

// See the LICENSE file in the project root for full license information.


#include <kcenon/thread/diagnostics/thread_pool_diagnostics.h>
#include <kcenon/thread/core/thread_pool.h>
#include <kcenon/thread/core/thread_worker.h>

#include <algorithm>
#include <cmath>
#include <ctime>
#include <format>
#include <iomanip>
#include <sstream>


namespace kcenon::thread::diagnostics

{

    // =========================================================================

    // Constructor / Destructor

    // =========================================================================


    thread_pool_diagnostics::thread_pool_diagnostics(thread_pool& pool,

                                                    const diagnostics_config& config)

        : pool_(pool)

        , config_(config)

        , tracing_enabled_(config.enable_tracing)

        , start_time_(std::chrono::steady_clock::now())

    {

    }


    // Defaulted out-of-line: the compiler-generated destructor is used.
    thread_pool_diagnostics::~thread_pool_diagnostics() = default;


    // =========================================================================

    // Thread Dump

    // =========================================================================


    /**
     * @brief Returns per-worker diagnostic records for the observed pool.
     *
     * The data comes straight from thread_pool::collect_worker_diagnostics();
     * nothing is filtered or transformed here.
     */
    auto thread_pool_diagnostics::dump_thread_states() const -> std::vector<thread_info>
    {
        auto worker_snapshot = pool_.collect_worker_diagnostics();
        return worker_snapshot;
    }


    /**
     * @brief Renders a human-readable dump of the pool and each worker.
     *
     * Layout: a header with the pool's to_string() label, the current UTC
     * time, and worker/active/idle counts, followed by one paragraph per
     * worker (name, thread id, state and time in that state, the current job
     * if any, completed/failed counters, and utilization as a percentage).
     *
     * The worker list, the worker count and the active/idle counts are
     * sampled by separate calls, so they may not describe the exact same
     * instant.
     *
     * @return The formatted dump.
     */
    auto thread_pool_diagnostics::format_thread_dump() const -> std::string
    {
        std::ostringstream oss;

        auto threads = dump_thread_states();

        // Convert "now" to UTC calendar fields with the re-entrant API.
        // std::gmtime() hands back a pointer to shared static storage, so
        // concurrent callers could overwrite each other's result, and it may
        // return nullptr, which must not be passed to std::put_time().
        const std::time_t now_c =
            std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
        std::tm utc_fields{};
#if defined(_WIN32)
        const bool have_utc = (::gmtime_s(&utc_fields, &now_c) == 0);
#else
        const bool have_utc = (::gmtime_r(&now_c, &utc_fields) != nullptr);
#endif

        std::size_t worker_count = 0;
        {
            std::scoped_lock<std::mutex> lock(pool_.workers_mutex_);
            worker_count = pool_.workers_.size();
        }
        auto active_count = pool_.get_active_worker_count();
        auto idle_count = pool_.get_idle_worker_count();

        // Header
        oss << "=== Thread Pool Dump: " << pool_.to_string() << " ===\n";
        oss << "Time: ";
        if (have_utc)
        {
            oss << std::put_time(&utc_fields, "%Y-%m-%dT%H:%M:%SZ");
        }
        else
        {
            // Conversion failed; say so rather than print a bogus date.
            oss << "unknown";
        }
        oss << "\n";
        oss << "Workers: " << worker_count << ", Active: " << active_count
            << ", Idle: " << idle_count << "\n\n";

        // Worker details
        for (const auto& t : threads)
        {
            auto state_duration = t.state_duration();
            auto duration_sec = std::chrono::duration<double>(state_duration).count();

            oss << t.thread_name << " [tid:" << t.thread_id << "] "
                << worker_state_to_string(t.state)
                << " (" << std::fixed << std::setprecision(1) << duration_sec << "s)\n";

            if (t.current_job.has_value())
            {
                const auto& job = t.current_job.value();
                auto exec_time_ms = std::chrono::duration<double, std::milli>(
                    job.execution_time).count();
                oss << "  Current Job: " << job.job_name << "#" << job.job_id
                    << " (running " << std::fixed << std::setprecision(0)
                    << exec_time_ms << "ms)\n";
            }

            oss << "  Jobs: " << t.jobs_completed << " completed, "
                << t.jobs_failed << " failed\n";
            // Utilization is stored as a fraction; scale to a percentage.
            oss << "  Utilization: " << std::fixed << std::setprecision(1)
                << (t.utilization * 100.0) << "%\n\n";
        }

        return oss.str();
    }


    // =========================================================================

    // Job Inspection

    // =========================================================================


    /**
     * @brief Collects the job each worker is currently executing.
     *
     * Workers whose `current_job` is empty contribute nothing, so the result
     * has at most one entry per worker, in worker-dump order.
     */
    auto thread_pool_diagnostics::get_active_jobs() const -> std::vector<job_info>
    {
        std::vector<job_info> running;

        for (const auto& worker : dump_thread_states())
        {
            if (!worker.current_job.has_value())
            {
                continue;  // this worker has no job right now
            }
            running.push_back(*worker.current_job);
        }

        return running;
    }


    /**
     * @brief Lists jobs still waiting in the pool's queue.
     *
     * @param limit Forwarded unchanged to the queue's inspect_pending_jobs().
     * @return The queue's answer, or an empty vector when the pool exposes no
     *         job queue.
     */
    auto thread_pool_diagnostics::get_pending_jobs(std::size_t limit) const
        -> std::vector<job_info>
    {
        if (auto pending_queue = pool_.get_job_queue())
        {
            return pending_queue->inspect_pending_jobs(limit);
        }

        // No queue attached: nothing can be pending.
        return {};
    }


    /**
     * @brief Returns up to @p limit recorded jobs, newest first.
     *
     * The history is read under jobs_mutex_.
     *
     * @param limit Maximum number of entries to return.
     */
    auto thread_pool_diagnostics::get_recent_jobs(std::size_t limit) const
        -> std::vector<job_info>
    {
        std::lock_guard<std::mutex> guard(jobs_mutex_);

        const std::size_t wanted = std::min(limit, recent_jobs_.size());

        std::vector<job_info> newest_first;
        newest_first.reserve(wanted);

        // Walk from the back; `wanted` never exceeds the history size, so the
        // iterator cannot run past rend().
        for (auto pos = recent_jobs_.rbegin(); newest_first.size() < wanted; ++pos)
        {
            newest_first.push_back(*pos);
        }

        return newest_first;
    }


    /**
     * @brief Appends @p info to the recent-jobs history.
     *
     * Runs under jobs_mutex_. After the append, the oldest entries are
     * dropped until the history holds at most config_.recent_jobs_capacity
     * items.
     */
    void thread_pool_diagnostics::record_job_completion(const job_info& info)
    {
        std::lock_guard<std::mutex> guard(jobs_mutex_);

        recent_jobs_.push_back(info);

        // Evict from the front (oldest) while over capacity.
        for (; recent_jobs_.size() > config_.recent_jobs_capacity;)
        {
            recent_jobs_.pop_front();
        }
    }


    // =========================================================================

    // Bottleneck Detection

    // =========================================================================


    /**
     * @brief Samples pool metrics and classifies the first matching bottleneck.
     *
     * Steps:
     *  1. Read the metrics snapshot, worker count (under workers_mutex_),
     *     active/idle worker counts and queue depth.
     *  2. Derive queue saturation: depth / capacity for bounded queues, or
     *     min(1, depth / (10 * workers)) as a heuristic for unbounded ones.
     *  3. Derive worker utilization (active / total, overridden by the mean of
     *     per-worker utilization when that mean is positive) and its variance.
     *  4. Approximate average wait time from total idle time per job and
     *     estimate the time needed to drain the backlog.
     *  5. Walk the checks in order (queue full, worker starvation, slow
     *     consumer, uneven distribution, lock contention, memory pressure) and
     *     keep the first that matches.
     *  6. Attach recommendations when a bottleneck was found.
     *
     * The individual samples are taken by separate calls and are therefore not
     * guaranteed to describe one consistent instant.
     *
     * @return The populated report; `has_bottleneck` tells whether any check
     *         matched.
     */
    auto thread_pool_diagnostics::detect_bottlenecks() const -> bottleneck_report
    {
        bottleneck_report report;

        // Gather metrics
        auto metrics_snap = pool_.metrics().snapshot();
        std::size_t worker_count = 0;
        {
            std::scoped_lock<std::mutex> lock(pool_.workers_mutex_);
            worker_count = pool_.workers_.size();
        }
        auto active_count = pool_.get_active_worker_count();
        auto idle_count = pool_.get_idle_worker_count();
        auto queue_depth = pool_.get_pending_task_count();

        report.queue_depth = queue_depth;
        report.idle_workers = idle_count;
        report.total_workers = worker_count;

        // Calculate queue saturation
        auto queue = pool_.get_job_queue();
        if (queue)
        {
            auto max_size = queue->get_max_size();
            if (max_size.has_value() && max_size.value() > 0)
            {
                report.queue_saturation = static_cast<double>(queue_depth) /
                                          static_cast<double>(max_size.value());
            }
            else if (queue_depth > 0)
            {
                // For unbounded queues, use heuristic: saturation based on queue depth vs workers
                // High queue depth relative to workers indicates potential saturation
                if (worker_count == 0)
                {
                    // Nothing can drain the queue. Report full saturation
                    // explicitly instead of dividing by zero below.
                    report.queue_saturation = 1.0;
                }
                else
                {
                    report.queue_saturation = std::min(1.0,
                        static_cast<double>(queue_depth) / static_cast<double>(worker_count * 10));
                }
            }
        }

        // Calculate worker utilization (instantaneous)
        if (worker_count > 0)
        {
            report.worker_utilization = static_cast<double>(active_count) /
                                        static_cast<double>(worker_count);
        }

        // Get per-worker utilization for variance calculation
        auto thread_states = pool_.collect_worker_diagnostics();
        if (!thread_states.empty())
        {
            // Calculate mean utilization from worker stats
            double sum_utilization = 0.0;
            for (const auto& t : thread_states)
            {
                sum_utilization += t.utilization;
            }
            double mean_utilization = sum_utilization / static_cast<double>(thread_states.size());

            // Calculate (population) variance
            double variance_sum = 0.0;
            for (const auto& t : thread_states)
            {
                double diff = t.utilization - mean_utilization;
                variance_sum += diff * diff;
            }
            report.utilization_variance = variance_sum / static_cast<double>(thread_states.size());

            // Use mean utilization from actual worker stats if available
            if (mean_utilization > 0.0)
            {
                report.worker_utilization = mean_utilization;
            }
        }

        // Calculate average wait time from metrics
        auto total_jobs = metrics_snap.tasks_executed + metrics_snap.tasks_failed;
        if (total_jobs > 0)
        {
            // Estimate wait time from idle time (approximation)
            auto avg_idle_ns = metrics_snap.total_idle_time_ns / total_jobs;
            report.avg_wait_time_ms = static_cast<double>(avg_idle_ns) / 1e6;

            // Calculate estimated backlog time
            // Average execution time per job (total_jobs is known to be > 0 here)
            double avg_exec_time_ms = 0.0;
            if (metrics_snap.total_busy_time_ns > 0)
            {
                avg_exec_time_ms = static_cast<double>(metrics_snap.total_busy_time_ns) /
                                   static_cast<double>(total_jobs) / 1e6;
            }

            // Estimated time to clear backlog = (queue_depth * avg_exec_time) / active_workers
            if (active_count > 0 && avg_exec_time_ms > 0)
            {
                report.estimated_backlog_time_ms = static_cast<std::size_t>(
                    (static_cast<double>(queue_depth) * avg_exec_time_ms) /
                    static_cast<double>(active_count));
            }
            // No worker is active right now: fall back to the total worker count.
            else if (worker_count > 0 && avg_exec_time_ms > 0)
            {
                report.estimated_backlog_time_ms = static_cast<std::size_t>(
                    (static_cast<double>(queue_depth) * avg_exec_time_ms) /
                    static_cast<double>(worker_count));
            }
        }

        // Jobs rejected tracking not available in basic metrics
        report.jobs_rejected = 0;

        // Detect bottleneck type (ordered by severity)
        // 1. Queue full - most critical
        if (report.queue_saturation > 0.95 || report.jobs_rejected > 0)
        {
            report.has_bottleneck = true;
            report.type = bottleneck_type::queue_full;
            report.description = "Queue is at or near capacity, jobs are being rejected";
        }
        // 2. Worker starvation - high utilization with growing backlog
        else if (report.worker_utilization > 0.95 && queue_depth > worker_count * 2)
        {
            report.has_bottleneck = true;
            report.type = bottleneck_type::worker_starvation;
            report.description = "Not enough workers to handle the workload";
        }
        // 3. Slow consumer - high wait time with high utilization
        else if (report.avg_wait_time_ms > config_.wait_time_threshold_ms &&
                 report.worker_utilization > config_.utilization_high_threshold)
        {
            report.has_bottleneck = true;
            report.type = bottleneck_type::slow_consumer;
            report.description = "Workers cannot keep up with job submission rate";
        }
        // 4. Uneven distribution - high variance in worker utilization
        else if (report.utilization_variance > 0.1 && worker_count > 1)
        {
            // Variance > 0.1 means standard deviation > ~0.32 which is significant
            report.has_bottleneck = true;
            report.type = bottleneck_type::uneven_distribution;
            report.description = "Work is not evenly distributed across workers";
        }
        // 5. Lock contention - high wait time but low utilization (workers waiting on locks)
        else if (report.avg_wait_time_ms > config_.wait_time_threshold_ms * 2 &&
                 report.worker_utilization < 0.5 && active_count > 0)
        {
            report.has_bottleneck = true;
            report.type = bottleneck_type::lock_contention;
            report.description = "High wait times with low utilization suggests lock contention";
        }
        // 6. Memory pressure - check queue memory usage
        else if (queue)
        {
            auto mem_stats = queue->get_memory_stats();
            // Consider memory pressure if queue uses more than 100MB
            constexpr std::size_t memory_threshold = 100 * 1024 * 1024;
            if (mem_stats.queue_size_bytes > memory_threshold)
            {
                report.has_bottleneck = true;
                report.type = bottleneck_type::memory_pressure;
                report.description = "Excessive memory usage in job queue";
            }
        }

        // Generate recommendations if bottleneck detected
        if (report.has_bottleneck)
        {
            generate_recommendations(report);
        }

        return report;
    }


    /**
     * @brief Appends remediation hints matching `report.type`.
     *
     * Each bottleneck type maps to a fixed list of suggestions that are
     * appended, in order, to `report.recommendations`. `none` and unknown
     * values add nothing.
     *
     * @param report Report to extend in place.
     */
    void thread_pool_diagnostics::generate_recommendations(bottleneck_report& report) const
    {
        // Appends every hint of a list, preserving order.
        const auto suggest = [&report](std::initializer_list<const char*> hints)
        {
            for (const char* hint : hints)
            {
                report.recommendations.push_back(hint);
            }
        };

        switch (report.type)
        {
            case bottleneck_type::queue_full:
                suggest({"Consider increasing queue capacity",
                         "Enable backpressure with adaptive policy",
                         "Add more worker threads if CPU permits"});
                break;

            case bottleneck_type::slow_consumer:
                suggest({"Add more worker threads",
                         "Optimize job execution time",
                         "Consider job batching for small tasks"});
                break;

            case bottleneck_type::worker_starvation:
                suggest({"Increase worker thread count",
                         "Consider scaling based on hardware cores",
                         "Enable autoscaling for dynamic adjustment"});
                break;

            case bottleneck_type::uneven_distribution:
                suggest({"Enable work stealing if not already",
                         "Review job distribution patterns",
                         "Consider using priority-based scheduling"});
                break;

            case bottleneck_type::lock_contention:
                suggest({"Review shared resource access patterns",
                         "Consider using lock-free data structures",
                         "Reduce critical section scope",
                         "Use finer-grained locking strategies"});
                break;

            case bottleneck_type::memory_pressure:
                suggest({"Reduce queue capacity or enable backpressure",
                         "Optimize job object size",
                         "Add more workers to process jobs faster",
                         "Consider job prioritization to clear backlog"});
                break;

            case bottleneck_type::none:
            default:
                break;  // nothing to recommend
        }
    }


    // =========================================================================

    // Health Checks

    // =========================================================================


    /**
     * @brief Builds a full health report for the observed pool.
     *
     * Fills in the check time, uptime since this object was constructed, job
     * totals, success rate and average latency (only when at least one job
     * has been processed), worker and queue figures, then appends the worker,
     * queue and metrics component checks and lets the status object derive
     * its overall state.
     */
    auto thread_pool_diagnostics::health_check() const -> health_status
    {
        health_status result;

        // Uptime is measured from construction of this diagnostics object.
        result.check_time = std::chrono::steady_clock::now();
        result.uptime_seconds =
            std::chrono::duration<double>(result.check_time - start_time_).count();

        // Job counters
        auto counters = pool_.metrics().snapshot();
        result.total_jobs_processed = counters.tasks_executed + counters.tasks_failed;

        if (result.total_jobs_processed > 0)
        {
            const auto processed = static_cast<double>(result.total_jobs_processed);

            result.success_rate = static_cast<double>(counters.tasks_executed) / processed;

            // Average latency = total busy time (converted to ms) per processed job.
            const double busy_ms = static_cast<double>(counters.total_busy_time_ns) / 1e6;
            result.avg_latency_ms = busy_ms / processed;
        }

        // Worker figures; the container size is read under the pool's lock.
        {
            std::scoped_lock<std::mutex> guard(pool_.workers_mutex_);
            result.total_workers = pool_.workers_.size();
        }
        result.active_workers = pool_.get_active_worker_count();
        result.queue_depth = pool_.get_pending_task_count();

        // Capacity is only reported when the queue declares a maximum size.
        if (auto job_queue_handle = pool_.get_job_queue())
        {
            if (auto limit = job_queue_handle->get_max_size(); limit.has_value())
            {
                result.queue_capacity = *limit;
            }
        }

        // Component checks, in fixed order.
        result.components.push_back(check_worker_health());
        result.components.push_back(check_queue_health());
        result.components.push_back(
            check_metrics_health(result.avg_latency_ms, result.success_rate));

        result.calculate_overall_status();
        return result;
    }


    /**
     * @brief Quick liveness probe.
     *
     * @return true when the pool reports it is running and owns at least one
     *         worker.
     */
    auto thread_pool_diagnostics::is_healthy() const -> bool
    {
        std::size_t live_workers = 0;
        {
            std::scoped_lock<std::mutex> guard(pool_.workers_mutex_);
            live_workers = pool_.workers_.size();
        }

        if (!pool_.is_running())
        {
            return false;
        }
        return live_workers > 0;
    }


    /**
     * @brief Evaluates the "workers" health component.
     *
     * Records total/active/idle counts in `details`, then classifies:
     * unhealthy when the pool is not running or has no workers, degraded when
     * every worker is active, healthy otherwise.
     */
    auto thread_pool_diagnostics::check_worker_health() const -> component_health
    {
        component_health report;
        report.name = "workers";

        std::size_t worker_total = 0;
        {
            std::scoped_lock<std::mutex> guard(pool_.workers_mutex_);
            worker_total = pool_.workers_.size();
        }
        auto busy = pool_.get_active_worker_count();
        auto available = pool_.get_idle_worker_count();

        report.details["total"] = std::to_string(worker_total);
        report.details["active"] = std::to_string(busy);
        report.details["idle"] = std::to_string(available);

        // Small helper so each verdict is a single line below.
        const auto conclude = [&report](health_state verdict, std::string text)
        {
            report.state = verdict;
            report.message = std::move(text);
        };

        if (!pool_.is_running())
        {
            conclude(health_state::unhealthy, "Thread pool is not running");
        }
        else if (worker_total == 0)
        {
            conclude(health_state::unhealthy, "No workers available");
        }
        else if (busy == worker_total)
        {
            conclude(health_state::degraded, "All workers are busy");
        }
        else
        {
            conclude(health_state::healthy, std::to_string(available) + " workers available");
        }

        return report;
    }


    /**
     * @brief Evaluates the "queue" health component.
     *
     * Records depth, and for bounded queues also capacity and saturation.
     * Saturation stays 0 for queues without a positive maximum size. The
     * state is chosen by comparing saturation with the configured critical
     * and warning thresholds. The rejected-jobs counter is a fixed 0 here, so
     * the rejection branches currently never fire.
     */
    auto thread_pool_diagnostics::check_queue_health() const -> component_health
    {
        component_health report;
        report.name = "queue";

        auto pending = pool_.get_pending_task_count();
        report.details["depth"] = std::to_string(pending);

        // Saturation is only meaningful when the queue has a positive bound.
        double fill_ratio = 0.0;
        if (auto job_queue_handle = pool_.get_job_queue())
        {
            auto limit = job_queue_handle->get_max_size();
            if (limit.has_value() && limit.value() > 0)
            {
                report.details["capacity"] = std::to_string(limit.value());
                fill_ratio = static_cast<double>(pending) / static_cast<double>(limit.value());
                report.details["saturation"] = std::format("{:.2f}", fill_ratio);
            }
        }

        // Rejection tracking needs a backpressure queue; assume none for a basic queue.
        const std::uint64_t rejected = 0;
        report.details["rejected"] = std::to_string(rejected);

        const auto& limits = config_.health_thresholds_config;
        const bool is_critical = fill_ratio >= limits.queue_saturation_critical;
        const bool is_warning = fill_ratio >= limits.queue_saturation_warning || rejected > 0;

        if (is_critical)
        {
            report.state = health_state::unhealthy;
            report.message = "Queue at critical capacity";
        }
        else if (is_warning)
        {
            report.state = health_state::degraded;
            if (rejected > 0)
            {
                report.message = std::to_string(rejected) + " jobs rejected due to backpressure";
            }
            else
            {
                report.message = "Queue saturation above warning threshold";
            }
        }
        else
        {
            report.state = health_state::healthy;
            report.message = "Queue operational";
        }

        return report;
    }


    /**
     * @brief Evaluates the "metrics" health component.
     *
     * Success rate is judged first (unhealthy below `unhealthy_success_rate`,
     * degraded below `min_success_rate`), then latency (degraded above either
     * `degraded_latency_ms` or `max_healthy_latency_ms`, with different
     * messages). Anything else is healthy.
     *
     * @param avg_latency_ms Average job latency in milliseconds.
     * @param success_rate   Fraction of successful jobs (formatted as a
     *                       percentage by multiplying by 100).
     */
    auto thread_pool_diagnostics::check_metrics_health(double avg_latency_ms,
                                                        double success_rate) const -> component_health
    {
        component_health report;
        report.name = "metrics";

        report.details["avg_latency_ms"] = std::format("{:.3f}", avg_latency_ms);
        report.details["success_rate"] = std::format("{:.4f}", success_rate);

        const auto& limits = config_.health_thresholds_config;

        // Success rate takes precedence over latency.
        if (success_rate < limits.unhealthy_success_rate)
        {
            report.state = health_state::unhealthy;
            report.message = "Success rate critically low: " +
                             std::format("{:.1f}%", success_rate * 100.0);
            return report;
        }
        if (success_rate < limits.min_success_rate)
        {
            report.state = health_state::degraded;
            report.message = "Success rate below threshold: " +
                             std::format("{:.1f}%", success_rate * 100.0);
            return report;
        }

        // Latency checks, strictest threshold first.
        if (avg_latency_ms > limits.degraded_latency_ms)
        {
            report.state = health_state::degraded;
            report.message = "High average latency: " +
                             std::format("{:.2f}ms", avg_latency_ms);
            return report;
        }
        if (avg_latency_ms > limits.max_healthy_latency_ms)
        {
            report.state = health_state::degraded;
            report.message = "Elevated latency: " +
                             std::format("{:.2f}ms", avg_latency_ms);
            return report;
        }

        report.state = health_state::healthy;
        report.message = "Performance metrics within normal range";
        return report;
    }


    // =========================================================================

    // Event Tracing

    // =========================================================================


    /**
     * @brief Switches event tracing on or off and sets the history limit.
     *
     * The atomic tracing flag is updated first. The history limit and the
     * stored configuration are then updated while holding events_mutex_,
     * because record_event() reads `config_.event_history_size` under that
     * same mutex; writing it unlocked would race with a concurrent
     * record_event().
     *
     * When enabling, events beyond the new limit are dropped oldest-first.
     * The existing history is otherwise kept (it is not cleared), and it is
     * left untouched when disabling.
     *
     * @param enable       true to record events, false to stop.
     * @param history_size Maximum number of events to retain.
     */
    void thread_pool_diagnostics::enable_tracing(bool enable, std::size_t history_size)
    {
        tracing_enabled_.store(enable, std::memory_order_relaxed);

        std::lock_guard<std::mutex> lock(events_mutex_);

        if (enable)
        {
            // Shrink to the new limit, discarding the oldest events.
            while (event_history_.size() > history_size)
            {
                event_history_.pop_front();
            }
        }

        // Update config (under the lock that guards readers of the limit)
        config_.event_history_size = history_size;
        config_.enable_tracing = enable;
    }


    /// Reports the current value of the atomic tracing flag (relaxed load).
    auto thread_pool_diagnostics::is_tracing_enabled() const -> bool
    {
        const bool tracing_on = tracing_enabled_.load(std::memory_order_relaxed);
        return tracing_on;
    }


    void thread_pool_diagnostics::add_event_listener(

        std::shared_ptr<execution_event_listener> listener)

    {

        if (!listener) return;


        std::lock_guard<std::mutex> lock(listeners_mutex_);

        listeners_.push_back(std::move(listener));

    }


    void thread_pool_diagnostics::remove_event_listener(

        std::shared_ptr<execution_event_listener> listener)

    {

        if (!listener) return;


        std::lock_guard<std::mutex> lock(listeners_mutex_);

        auto it = std::find(listeners_.begin(), listeners_.end(), listener);

        if (it != listeners_.end())

        {

            listeners_.erase(it);

        }

    }


    /**
     * @brief Stores @p event in the history and forwards it to listeners.
     *
     * Does nothing while tracing is disabled. The history is updated under
     * events_mutex_ and trimmed oldest-first to config_.event_history_size;
     * listeners are notified after that lock has been released.
     */
    void thread_pool_diagnostics::record_event(const job_execution_event& event)
    {
        const bool tracing_on = tracing_enabled_.load(std::memory_order_relaxed);
        if (!tracing_on)
        {
            return;
        }

        {
            std::lock_guard<std::mutex> guard(events_mutex_);

            event_history_.push_back(event);

            // Keep the history bounded by discarding the oldest entries.
            for (; event_history_.size() > config_.event_history_size;)
            {
                event_history_.pop_front();
            }
        }

        notify_listeners(event);
    }


    void thread_pool_diagnostics::notify_listeners(const job_execution_event& event)

    {

        std::vector<std::shared_ptr<execution_event_listener>> listeners_copy;

        {

            std::lock_guard<std::mutex> lock(listeners_mutex_);

            listeners_copy = listeners_;

        }


        for (const auto& listener : listeners_copy)

        {

            if (listener)

            {

                listener->on_event(event);

            }

        }

    }


    /**
     * @brief Returns up to @p limit recorded events, newest first.
     *
     * The history is read under events_mutex_.
     *
     * @param limit Maximum number of events to return.
     */
    auto thread_pool_diagnostics::get_recent_events(std::size_t limit) const
        -> std::vector<job_execution_event>
    {
        std::lock_guard<std::mutex> guard(events_mutex_);

        const std::size_t wanted = std::min(limit, event_history_.size());

        std::vector<job_execution_event> newest_first;
        newest_first.reserve(wanted);

        // `wanted` never exceeds the history size, so the reverse iterator
        // stays within range.
        for (auto pos = event_history_.rbegin(); newest_first.size() < wanted; ++pos)
        {
            newest_first.push_back(*pos);
        }

        return newest_first;
    }


    // =========================================================================

    // Export

    // =========================================================================


    /**
     * @brief Serializes a diagnostics summary as a JSON object.
     *
     * Sections: "health" (status, message, uptime, job total, success rate),
     * "workers" (total, active, idle), "queue" (depth) and "bottleneck"
     * (detected, type, severity). Runs health_check() and
     * detect_bottlenecks() to obtain the values.
     *
     * The free-form status message is JSON-escaped so quotes, backslashes or
     * control characters in it cannot produce an invalid document. The idle
     * count is clamped at zero because total and active workers are sampled
     * at different moments and the unsigned difference could otherwise wrap.
     *
     * @return The JSON text (no trailing newline).
     */
    auto thread_pool_diagnostics::to_json() const -> std::string
    {
        // Escapes a string for use inside a JSON string literal.
        const auto json_escape = [](const std::string& raw)
        {
            const char* const hex_digits = "0123456789abcdef";
            std::string out;
            out.reserve(raw.size());
            for (const char ch : raw)
            {
                switch (ch)
                {
                    case '"':  out += "\\\""; break;
                    case '\\': out += "\\\\"; break;
                    case '\b': out += "\\b";  break;
                    case '\f': out += "\\f";  break;
                    case '\n': out += "\\n";  break;
                    case '\r': out += "\\r";  break;
                    case '\t': out += "\\t";  break;
                    default:
                        if (static_cast<unsigned char>(ch) < 0x20)
                        {
                            // Remaining control characters need \u00XX form.
                            out += "\\u00";
                            out += hex_digits[(static_cast<unsigned char>(ch) >> 4) & 0x0F];
                            out += hex_digits[static_cast<unsigned char>(ch) & 0x0F];
                        }
                        else
                        {
                            out += ch;
                        }
                        break;
                }
            }
            return out;
        };

        std::ostringstream oss;
        oss << "{\n";

        // Health status
        auto health = health_check();
        const std::string status_message(health.status_message);
        oss << "  \"health\": {\n";
        oss << "    \"status\": \"" << health_state_to_string(health.overall_status) << "\",\n";
        oss << "    \"message\": \"" << json_escape(status_message) << "\",\n";
        oss << "    \"uptime_seconds\": " << std::fixed << std::setprecision(2)
            << health.uptime_seconds << ",\n";
        oss << "    \"total_jobs_processed\": " << health.total_jobs_processed << ",\n";
        oss << "    \"success_rate\": " << std::fixed << std::setprecision(4)
            << health.success_rate << "\n";
        oss << "  },\n";

        // Workers
        auto idle_workers = health.total_workers - health.active_workers;
        if (health.active_workers > health.total_workers)
        {
            // Counts come from separate samples; never report a wrapped value.
            idle_workers = 0;
        }
        oss << "  \"workers\": {\n";
        oss << "    \"total\": " << health.total_workers << ",\n";
        oss << "    \"active\": " << health.active_workers << ",\n";
        oss << "    \"idle\": " << idle_workers << "\n";
        oss << "  },\n";

        // Queue
        oss << "  \"queue\": {\n";
        oss << "    \"depth\": " << health.queue_depth << "\n";
        oss << "  },\n";

        // Bottleneck
        auto bottleneck = detect_bottlenecks();
        oss << "  \"bottleneck\": {\n";
        oss << "    \"detected\": " << (bottleneck.has_bottleneck ? "true" : "false") << ",\n";
        oss << "    \"type\": \"" << bottleneck_type_to_string(bottleneck.type) << "\",\n";
        oss << "    \"severity\": \"" << bottleneck.severity_string() << "\"\n";
        oss << "  }\n";

        oss << "}";
        return oss.str();
    }


    /// Text export; identical to the output of format_thread_dump().
    auto thread_pool_diagnostics::to_string() const -> std::string
    {
        std::string dump_text = format_thread_dump();
        return dump_text;
    }


    /**
     * @brief Prometheus export.
     *
     * Runs health_check() and delegates formatting to
     * health_status::to_prometheus(), passing the pool's to_string() label.
     */
    auto thread_pool_diagnostics::to_prometheus() const -> std::string
    {
        auto current_health = health_check();
        return current_health.to_prometheus(pool_.to_string());
    }


    // =========================================================================

    // Configuration

    // =========================================================================


    /// Returns a copy of the stored diagnostics configuration.
    auto thread_pool_diagnostics::get_config() const -> diagnostics_config
    {
        diagnostics_config current = config_;
        return current;
    }


    /**
     * @brief Replaces the stored configuration with @p config.
     *
     * The atomic tracing flag is then brought in line with
     * `config.enable_tracing`. Existing job and event histories are not
     * touched here.
     */
    void thread_pool_diagnostics::set_config(const diagnostics_config& config)
    {
        config_ = config;

        const bool want_tracing = config.enable_tracing;
        tracing_enabled_.store(want_tracing, std::memory_order_relaxed);
    }


    /**
     * @brief Builds a minimal thread_info record for one worker.
     *
     * Only the worker id, a synthesized name ("Worker-<index>"), the
     * idle/active state and `state_since` are filled in; `state_since` is set
     * to the time of this call. All other fields keep their defaults.
     *
     * @param worker Worker to describe.
     * @param index  Position used to build the display name.
     */
    auto thread_pool_diagnostics::get_worker_info(const thread_worker& worker,
                                                  std::size_t index) const -> thread_info
    {
        thread_info details;

        details.worker_id = worker.get_worker_id();
        details.thread_name = "Worker-" + std::to_string(index);

        if (worker.is_idle())
        {
            details.state = worker_state::idle;
        }
        else
        {
            details.state = worker_state::active;
        }

        details.state_since = std::chrono::steady_clock::now();
        return details;
    }


} // namespace kcenon::thread::diagnostics


kcenon::thread::diagnostics::thread_pool_diagnostics::check_queue_health
auto check_queue_health() const -> component_health
Checks queue component health.
Definition thread_pool_diagnostics.cpp:491

kcenon::thread::diagnostics::thread_pool_diagnostics::generate_recommendations
void generate_recommendations(bottleneck_report &report) const
Generates recommendations for a bottleneck.
Definition thread_pool_diagnostics.cpp:331

kcenon::thread::diagnostics::thread_pool_diagnostics::get_active_jobs
auto get_active_jobs() const -> std::vector< job_info >
Gets currently executing jobs.
Definition thread_pool_diagnostics.cpp:97

kcenon::thread::diagnostics::thread_pool_diagnostics::add_event_listener
void add_event_listener(std::shared_ptr< execution_event_listener > listener)
Adds an event listener.
Definition thread_pool_diagnostics.cpp:620

kcenon::thread::diagnostics::thread_pool_diagnostics::get_worker_info
auto get_worker_info(const thread_worker &worker, std::size_t index) const -> thread_info
Gets thread info for a single worker.
Definition thread_pool_diagnostics.cpp:769

kcenon::thread::diagnostics::thread_pool_diagnostics::to_json
auto to_json() const -> std::string
Exports diagnostics as JSON.
Definition thread_pool_diagnostics.cpp:702

kcenon::thread::diagnostics::thread_pool_diagnostics::get_recent_jobs
auto get_recent_jobs(std::size_t limit=100) const -> std::vector< job_info >
Gets recent completed/failed jobs.
Definition thread_pool_diagnostics.cpp:128

kcenon::thread::diagnostics::thread_pool_diagnostics::config_
diagnostics_config config_
Configuration for diagnostics.
Definition thread_pool_diagnostics.h:351

kcenon::thread::diagnostics::thread_pool_diagnostics::~thread_pool_diagnostics
~thread_pool_diagnostics()
Destructor.

kcenon::thread::diagnostics::thread_pool_diagnostics::to_prometheus
auto to_prometheus() const -> std::string
Exports diagnostics as Prometheus-compatible metrics.
Definition thread_pool_diagnostics.cpp:748

kcenon::thread::diagnostics::thread_pool_diagnostics::listeners_mutex_
std::mutex listeners_mutex_
Mutex for event listeners.
Definition thread_pool_diagnostics.h:381

kcenon::thread::diagnostics::thread_pool_diagnostics::listeners_
std::vector< std::shared_ptr< execution_event_listener > > listeners_
Event listeners.
Definition thread_pool_diagnostics.h:386

kcenon::thread::diagnostics::thread_pool_diagnostics::record_job_completion
void record_job_completion(const job_info &info)
Records a job completion for history tracking.
Definition thread_pool_diagnostics.cpp:146

kcenon::thread::diagnostics::thread_pool_diagnostics::pool_
thread_pool & pool_
Reference to the monitored thread pool.
Definition thread_pool_diagnostics.h:346

kcenon::thread::diagnostics::thread_pool_diagnostics::detect_bottlenecks
auto detect_bottlenecks() const -> bottleneck_report
Analyzes for bottlenecks.
Definition thread_pool_diagnostics.cpp:161

kcenon::thread::diagnostics::thread_pool_diagnostics::is_tracing_enabled
auto is_tracing_enabled() const -> bool
Checks if tracing is enabled.
Definition thread_pool_diagnostics.cpp:615

kcenon::thread::diagnostics::thread_pool_diagnostics::check_metrics_health
auto check_metrics_health(double avg_latency_ms, double success_rate) const -> component_health
Checks metrics component health.
Definition thread_pool_diagnostics.cpp:546

kcenon::thread::diagnostics::thread_pool_diagnostics::get_pending_jobs
auto get_pending_jobs(std::size_t limit=100) const -> std::vector< job_info >
Gets pending jobs in queue.
Definition thread_pool_diagnostics.cpp:115

kcenon::thread::diagnostics::thread_pool_diagnostics::get_config
auto get_config() const -> diagnostics_config
Gets the current configuration.
Definition thread_pool_diagnostics.cpp:758

kcenon::thread::diagnostics::thread_pool_diagnostics::get_recent_events
auto get_recent_events(std::size_t limit=100) const -> std::vector< job_execution_event >
Gets recent execution events.
Definition thread_pool_diagnostics.cpp:680

kcenon::thread::diagnostics::thread_pool_diagnostics::jobs_mutex_
std::mutex jobs_mutex_
Mutex for recent jobs access.
Definition thread_pool_diagnostics.h:371

kcenon::thread::diagnostics::thread_pool_diagnostics::start_time_
std::chrono::steady_clock::time_point start_time_
Time when this diagnostics object was constructed (treated as the pool start time).
Definition thread_pool_diagnostics.h:396

kcenon::thread::diagnostics::thread_pool_diagnostics::format_thread_dump
auto format_thread_dump() const -> std::string
Gets formatted thread dump (human-readable).
Definition thread_pool_diagnostics.cpp:42

kcenon::thread::diagnostics::thread_pool_diagnostics::thread_pool_diagnostics
thread_pool_diagnostics(thread_pool &pool, const diagnostics_config &config={})
Constructs diagnostics for a thread pool.
Definition thread_pool_diagnostics.cpp:21

kcenon::thread::diagnostics::thread_pool_diagnostics::notify_listeners
void notify_listeners(const job_execution_event &event)
Notifies all event listeners.
Definition thread_pool_diagnostics.cpp:663

kcenon::thread::diagnostics::thread_pool_diagnostics::to_string
auto to_string() const -> std::string
Exports diagnostics as formatted string.
Definition thread_pool_diagnostics.cpp:743

kcenon::thread::diagnostics::thread_pool_diagnostics::remove_event_listener
void remove_event_listener(std::shared_ptr< execution_event_listener > listener)
Removes an event listener.
Definition thread_pool_diagnostics.cpp:629

kcenon::thread::diagnostics::thread_pool_diagnostics::enable_tracing
void enable_tracing(bool enable, std::size_t history_size=1000)
Enables or disables job execution tracing.
Definition thread_pool_diagnostics.cpp:596

kcenon::thread::diagnostics::thread_pool_diagnostics::health_check
auto health_check() const -> health_status
Performs comprehensive health check.
Definition thread_pool_diagnostics.cpp:383

kcenon::thread::diagnostics::thread_pool_diagnostics::tracing_enabled_
std::atomic< bool > tracing_enabled_
Whether event tracing is enabled.
Definition thread_pool_diagnostics.h:356

kcenon::thread::diagnostics::thread_pool_diagnostics::record_event
void record_event(const job_execution_event &event)
Records a job execution event.
Definition thread_pool_diagnostics.cpp:642

kcenon::thread::diagnostics::thread_pool_diagnostics::is_healthy
auto is_healthy() const -> bool
Quick check if pool is healthy.
Definition thread_pool_diagnostics.cpp:440

kcenon::thread::diagnostics::thread_pool_diagnostics::event_history_
std::deque< job_execution_event > event_history_
Ring buffer for event history.
Definition thread_pool_diagnostics.h:366

kcenon::thread::diagnostics::thread_pool_diagnostics::events_mutex_
std::mutex events_mutex_
Mutex for event history access.
Definition thread_pool_diagnostics.h:361

kcenon::thread::diagnostics::thread_pool_diagnostics::dump_thread_states
auto dump_thread_states() const -> std::vector< thread_info >
Gets current state of all worker threads.
Definition thread_pool_diagnostics.cpp:36

kcenon::thread::diagnostics::thread_pool_diagnostics::set_config
void set_config(const diagnostics_config &config)
Updates the configuration.
Definition thread_pool_diagnostics.cpp:763

kcenon::thread::diagnostics::thread_pool_diagnostics::check_worker_health
auto check_worker_health() const -> component_health
Checks worker component health.
Definition thread_pool_diagnostics.cpp:450

kcenon::thread::diagnostics::thread_pool_diagnostics::recent_jobs_
std::deque< job_info > recent_jobs_
Ring buffer for recent job completions.
Definition thread_pool_diagnostics.h:376

kcenon::thread::job
Represents a unit of work (task) to be executed, typically by a job queue.
Definition job.h:136

kcenon::thread::metrics::ThreadPoolMetrics::snapshot
Snapshot snapshot() const
Get a snapshot of all metrics.
Definition thread_pool_metrics.h:105

kcenon::thread::result
A template class representing either a value or an error.
Definition error_handling.h:252

kcenon::thread::thread_pool
A thread pool for concurrent execution of jobs using multiple worker threads.
Definition thread_pool.h:182

kcenon::thread::thread_pool::get_pending_task_count
auto get_pending_task_count() const -> std::size_t
Get the number of pending tasks in the queue.
Definition thread_pool.cpp:695

kcenon::thread::thread_pool::get_idle_worker_count
std::size_t get_idle_worker_count() const
Get the number of idle workers.
Definition thread_pool.cpp:679

kcenon::thread::thread_pool::to_string
auto to_string(void) const -> std::string
Provides a string representation of this thread_pool.
Definition thread_pool.cpp:616

kcenon::thread::thread_pool::get_active_worker_count
auto get_active_worker_count() const -> std::size_t
Get the current number of active (running) workers.
Definition thread_pool.cpp:755

kcenon::thread::thread_pool::workers_
std::vector< std::unique_ptr< thread_worker > > workers_
A collection of worker threads associated with this pool.
Definition thread_pool.h:702

kcenon::thread::thread_pool::collect_worker_diagnostics
auto collect_worker_diagnostics() const -> std::vector< diagnostics::thread_info >
Collects diagnostics information from all workers.
Definition thread_pool.cpp:783

kcenon::thread::thread_pool::is_running
auto is_running() const -> bool
Check if the thread pool is currently running.
Definition thread_pool.cpp:689

kcenon::thread::thread_pool::metrics
const metrics::ThreadPoolMetrics & metrics() const noexcept
Access aggregated runtime metrics (read-only reference).
Definition thread_pool.cpp:273

kcenon::thread::thread_pool::get_job_queue
auto get_job_queue(void) -> std::shared_ptr< job_queue >
Returns the shared job_queue used by this thread pool.
Definition thread_pool.cpp:269

kcenon::thread::thread_pool::workers_mutex_
std::mutex workers_mutex_
Mutex protecting concurrent access to the workers_ vector.
Definition thread_pool.h:713

kcenon::thread::thread_worker
A specialized worker thread that processes jobs from a job_queue.
Definition thread_worker.h:68

thread_pool.h
Core thread pool implementation with work stealing and auto-scaling.

kcenon::thread::diagnostics::worker_state::active
@ active
Worker is executing a job.

kcenon::thread::diagnostics::worker_state::idle
@ idle
Worker is waiting for jobs.

kcenon::thread::diagnostics::health_state::healthy
@ healthy
Component is fully operational.

kcenon::thread::diagnostics::health_state::degraded
@ degraded
Component is operational but with reduced capacity/performance.

kcenon::thread::diagnostics::health_state::unhealthy
@ unhealthy
Component is not operational or failing.

kcenon::thread::diagnostics::bottleneck_type::slow_consumer
@ slow_consumer
Workers can't keep up with job submission rate.

kcenon::thread::diagnostics::bottleneck_type::lock_contention
@ lock_contention
High mutex wait times affecting throughput.

kcenon::thread::diagnostics::bottleneck_type::worker_starvation
@ worker_starvation
Not enough workers for the workload.

kcenon::thread::diagnostics::bottleneck_type::none
@ none
No bottleneck detected.

kcenon::thread::diagnostics::bottleneck_type::queue_full
@ queue_full
Queue is at capacity.

kcenon::thread::diagnostics::bottleneck_type::memory_pressure
@ memory_pressure
Excessive memory allocations causing slowdown.

kcenon::thread::diagnostics::bottleneck_type::uneven_distribution
@ uneven_distribution
Work is not evenly distributed (work stealing needed).

kcenon::thread::diagnostics
Definition thread_pool_diagnostics.cpp:16

kcenon::thread::diagnostics::bottleneck_type_to_string
auto bottleneck_type_to_string(bottleneck_type type) -> std::string
Converts bottleneck_type to human-readable string.
Definition bottleneck_report.h:47

kcenon::thread::diagnostics::worker_state_to_string
auto worker_state_to_string(worker_state state) -> std::string
Converts worker_state to human-readable string.
Definition thread_info.h:47

kcenon::thread::diagnostics::health_state_to_string
auto health_state_to_string(health_state state) -> std::string
Converts health_state to human-readable string.
Definition health_status.h:103

kcenon::thread::log_level_v2::info
@ info
Informational messages highlighting progress.

kcenon::thread::scaling_reason::queue_depth
@ queue_depth
Queue depth threshold exceeded.

std
STL namespace.

kcenon::thread::diagnostics::bottleneck_report
Analysis report of bottlenecks in the thread pool.
Definition bottleneck_report.h:92

kcenon::thread::diagnostics::bottleneck_report::type
bottleneck_type type
Type of bottleneck detected.
Definition bottleneck_report.h:108

kcenon::thread::diagnostics::bottleneck_report::avg_wait_time_ms
double avg_wait_time_ms
Average wait time in milliseconds.
Definition bottleneck_report.h:127

kcenon::thread::diagnostics::bottleneck_report::recommendations
std::vector< std::string > recommendations
Actionable recommendations to resolve the bottleneck.
Definition bottleneck_report.h:181

kcenon::thread::diagnostics::bottleneck_report::description
std::string description
Human-readable description of the bottleneck.
Definition bottleneck_report.h:103

kcenon::thread::diagnostics::bottleneck_report::utilization_variance
double utilization_variance
Variance in worker utilization.
Definition bottleneck_report.h:148

kcenon::thread::diagnostics::bottleneck_report::has_bottleneck
bool has_bottleneck
Whether a bottleneck was detected.
Definition bottleneck_report.h:96

kcenon::thread::diagnostics::bottleneck_report::jobs_rejected
std::uint64_t jobs_rejected
Jobs rejected due to queue full.
Definition bottleneck_report.h:155

kcenon::thread::diagnostics::bottleneck_report::estimated_backlog_time_ms
std::size_t estimated_backlog_time_ms
Estimated time to process the current backlog.
Definition bottleneck_report.h:141

kcenon::thread::diagnostics::bottleneck_report::queue_depth
std::size_t queue_depth
Current queue depth.
Definition bottleneck_report.h:160

kcenon::thread::diagnostics::bottleneck_report::queue_saturation
double queue_saturation
Queue saturation level.
Definition bottleneck_report.h:120

kcenon::thread::diagnostics::bottleneck_report::idle_workers
std::size_t idle_workers
Number of idle workers.
Definition bottleneck_report.h:165

kcenon::thread::diagnostics::bottleneck_report::total_workers
std::size_t total_workers
Total number of workers.
Definition bottleneck_report.h:170

kcenon::thread::diagnostics::bottleneck_report::worker_utilization
double worker_utilization
Average worker utilization.
Definition bottleneck_report.h:134

kcenon::thread::diagnostics::component_health
Health status of a single component.
Definition health_status.h:144

kcenon::thread::diagnostics::component_health::name
std::string name
Name of the component (e.g., "workers", "queue", "metrics").
Definition health_status.h:148

kcenon::thread::diagnostics::component_health::state
health_state state
Current health state of this component.
Definition health_status.h:153

kcenon::thread::diagnostics::component_health::message
std::string message
Human-readable message describing the current state.
Definition health_status.h:158

kcenon::thread::diagnostics::component_health::details
std::map< std::string, std::string > details
Additional details about this component's health.
Definition health_status.h:165

kcenon::thread::diagnostics::diagnostics_config
Configuration options for thread pool diagnostics.
Definition thread_pool_diagnostics.h:51

kcenon::thread::diagnostics::diagnostics_config::health_thresholds_config
health_thresholds health_thresholds_config
Configurable thresholds for health status determination.
Definition thread_pool_diagnostics.h:85

kcenon::thread::diagnostics::diagnostics_config::recent_jobs_capacity
std::size_t recent_jobs_capacity
Maximum number of recent jobs to track.
Definition thread_pool_diagnostics.h:55

kcenon::thread::diagnostics::diagnostics_config::wait_time_threshold_ms
double wait_time_threshold_ms
Wait time threshold (ms) for slow consumer detection.
Definition thread_pool_diagnostics.h:80

kcenon::thread::diagnostics::diagnostics_config::enable_tracing
bool enable_tracing
Enable automatic event tracing.
Definition thread_pool_diagnostics.h:65

kcenon::thread::diagnostics::diagnostics_config::event_history_size
std::size_t event_history_size
Maximum number of events to retain in history.
Definition thread_pool_diagnostics.h:60

kcenon::thread::diagnostics::diagnostics_config::utilization_high_threshold
double utilization_high_threshold
Worker utilization threshold for bottleneck detection.
Definition thread_pool_diagnostics.h:75

kcenon::thread::diagnostics::health_status
Comprehensive health status of the thread pool.
Definition health_status.h:206

kcenon::thread::diagnostics::health_status::success_rate
double success_rate
Job success rate (0.0 to 1.0).
Definition health_status.h:248

kcenon::thread::diagnostics::health_status::active_workers
std::size_t active_workers
Number of active workers.
Definition health_status.h:258

kcenon::thread::diagnostics::health_status::components
std::vector< component_health > components
Health status of individual components.
Definition health_status.h:229

kcenon::thread::diagnostics::health_status::total_workers
std::size_t total_workers
Total number of workers.
Definition health_status.h:263

kcenon::thread::diagnostics::health_status::total_jobs_processed
std::uint64_t total_jobs_processed
Total number of jobs processed since startup.
Definition health_status.h:243

kcenon::thread::diagnostics::health_status::queue_capacity
std::size_t queue_capacity
Queue capacity (if bounded).
Definition health_status.h:273

kcenon::thread::diagnostics::health_status::uptime_seconds
double uptime_seconds
Time since the thread pool was started (seconds).
Definition health_status.h:238

kcenon::thread::diagnostics::health_status::calculate_overall_status
auto calculate_overall_status() -> void
Calculates overall status from component states.
Definition health_status.h:334

kcenon::thread::diagnostics::health_status::check_time
std::chrono::steady_clock::time_point check_time
Time when this health check was performed.
Definition health_status.h:224

kcenon::thread::diagnostics::health_status::queue_depth
std::size_t queue_depth
Current queue depth.
Definition health_status.h:268

kcenon::thread::diagnostics::health_status::avg_latency_ms
double avg_latency_ms
Average job latency in milliseconds.
Definition health_status.h:253

kcenon::thread::diagnostics::job_execution_event
Event data for job execution tracing.
Definition execution_event.h:98

kcenon::thread::diagnostics::job_info
Information about a job in the thread pool.
Definition job_info.h:90

kcenon::thread::diagnostics::thread_info
Information about a worker thread in the pool.
Definition thread_info.h:88

kcenon::thread::diagnostics::thread_info::worker_id
std::size_t worker_id
Worker ID within the pool.
Definition thread_info.h:108

thread_pool_diagnostics.h
Runtime diagnostics, health monitoring, and execution tracing for thread pools.

thread_worker.h
Specialized worker thread that processes jobs from a job_queue.