Thread System 0.3.1
High-performance C++20 thread pool with work stealing and DAG scheduling
Loading...
Searching...
No Matches
metrics_backend.cpp
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2024, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
6
7#include <iomanip>
8#include <sstream>
9
11
12// =============================================================================
13// PrometheusBackend Implementation
14// =============================================================================
15
17 if (labels_.empty()) {
18 return "";
19 }
20
21 std::ostringstream oss;
22 oss << "{";
23 bool first = true;
24 for (const auto& [key, value] : labels_) {
25 if (!first) {
26 oss << ",";
27 }
28 oss << key << "=\"" << value << "\"";
29 first = false;
30 }
31 oss << "}";
32 return oss.str();
33}
34
35std::string PrometheusBackend::export_base(const BaseSnapshot& snapshot) const {
36 std::ostringstream oss;
37 oss << std::fixed << std::setprecision(6);
38 const auto& p = prefix_;
39 const auto labels = format_labels();
40
41 // Task counters
42 oss << "# HELP " << p << "_tasks_submitted_total Total tasks submitted\n";
43 oss << "# TYPE " << p << "_tasks_submitted_total counter\n";
44 oss << p << "_tasks_submitted_total" << labels << " " << snapshot.tasks_submitted << "\n\n";
45
46 oss << "# HELP " << p << "_tasks_executed_total Total tasks executed\n";
47 oss << "# TYPE " << p << "_tasks_executed_total counter\n";
48 oss << p << "_tasks_executed_total" << labels << " " << snapshot.tasks_executed << "\n\n";
49
50 oss << "# HELP " << p << "_tasks_failed_total Total tasks failed\n";
51 oss << "# TYPE " << p << "_tasks_failed_total counter\n";
52 oss << p << "_tasks_failed_total" << labels << " " << snapshot.tasks_failed << "\n\n";
53
54 // Time counters
55 oss << "# HELP " << p << "_busy_time_nanoseconds_total Total busy time\n";
56 oss << "# TYPE " << p << "_busy_time_nanoseconds_total counter\n";
57 oss << p << "_busy_time_nanoseconds_total" << labels << " " << snapshot.total_busy_time_ns << "\n\n";
58
59 oss << "# HELP " << p << "_idle_time_nanoseconds_total Total idle time\n";
60 oss << "# TYPE " << p << "_idle_time_nanoseconds_total counter\n";
61 oss << p << "_idle_time_nanoseconds_total" << labels << " " << snapshot.total_idle_time_ns << "\n";
62
63 return oss.str();
64}
65
66std::string PrometheusBackend::export_enhanced(const EnhancedSnapshot& snapshot) const {
67 std::ostringstream oss;
68 oss << std::fixed << std::setprecision(6);
69 const auto& p = prefix_;
70 const auto labels = format_labels();
71
72 // Task counters
73 oss << "# HELP " << p << "_tasks_submitted_total Total tasks submitted\n";
74 oss << "# TYPE " << p << "_tasks_submitted_total counter\n";
75 oss << p << "_tasks_submitted_total" << labels << " " << snapshot.tasks_submitted << "\n\n";
76
77 oss << "# HELP " << p << "_tasks_executed_total Total tasks executed\n";
78 oss << "# TYPE " << p << "_tasks_executed_total counter\n";
79 oss << p << "_tasks_executed_total" << labels << " " << snapshot.tasks_executed << "\n\n";
80
81 oss << "# HELP " << p << "_tasks_failed_total Total tasks failed\n";
82 oss << "# TYPE " << p << "_tasks_failed_total counter\n";
83 oss << p << "_tasks_failed_total" << labels << " " << snapshot.tasks_failed << "\n\n";
84
85 // Latency summary (enqueue)
86 oss << "# HELP " << p << "_enqueue_latency_us Enqueue latency in microseconds\n";
87 oss << "# TYPE " << p << "_enqueue_latency_us summary\n";
88 oss << p << "_enqueue_latency_us{quantile=\"0.5\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.enqueue_latency_p50_us << "\n";
89 oss << p << "_enqueue_latency_us{quantile=\"0.9\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.enqueue_latency_p90_us << "\n";
90 oss << p << "_enqueue_latency_us{quantile=\"0.99\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.enqueue_latency_p99_us << "\n\n";
91
92 // Latency summary (execution)
93 oss << "# HELP " << p << "_execution_latency_us Execution latency in microseconds\n";
94 oss << "# TYPE " << p << "_execution_latency_us summary\n";
95 oss << p << "_execution_latency_us{quantile=\"0.5\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.execution_latency_p50_us << "\n";
96 oss << p << "_execution_latency_us{quantile=\"0.9\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.execution_latency_p90_us << "\n";
97 oss << p << "_execution_latency_us{quantile=\"0.99\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.execution_latency_p99_us << "\n\n";
98
99 // Latency summary (wait time)
100 oss << "# HELP " << p << "_wait_time_us Queue wait time in microseconds\n";
101 oss << "# TYPE " << p << "_wait_time_us summary\n";
102 oss << p << "_wait_time_us{quantile=\"0.5\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.wait_time_p50_us << "\n";
103 oss << p << "_wait_time_us{quantile=\"0.9\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.wait_time_p90_us << "\n";
104 oss << p << "_wait_time_us{quantile=\"0.99\"" << (labels.empty() ? "" : "," + labels.substr(1, labels.size() - 2)) << "} " << snapshot.wait_time_p99_us << "\n\n";
105
106 // Throughput
107 oss << "# HELP " << p << "_throughput_1s Tasks per second (1s window)\n";
108 oss << "# TYPE " << p << "_throughput_1s gauge\n";
109 oss << p << "_throughput_1s" << labels << " " << snapshot.throughput_1s << "\n\n";
110
111 oss << "# HELP " << p << "_throughput_1m Tasks per second (1m window)\n";
112 oss << "# TYPE " << p << "_throughput_1m gauge\n";
113 oss << p << "_throughput_1m" << labels << " " << snapshot.throughput_1m << "\n\n";
114
115 // Queue depth
116 oss << "# HELP " << p << "_queue_depth_current Current queue depth\n";
117 oss << "# TYPE " << p << "_queue_depth_current gauge\n";
118 oss << p << "_queue_depth_current" << labels << " " << snapshot.current_queue_depth << "\n\n";
119
120 oss << "# HELP " << p << "_queue_depth_peak Peak queue depth\n";
121 oss << "# TYPE " << p << "_queue_depth_peak gauge\n";
122 oss << p << "_queue_depth_peak" << labels << " " << snapshot.peak_queue_depth << "\n\n";
123
124 // Worker utilization
125 oss << "# HELP " << p << "_worker_utilization Overall worker utilization\n";
126 oss << "# TYPE " << p << "_worker_utilization gauge\n";
127 oss << p << "_worker_utilization" << labels << " " << snapshot.worker_utilization << "\n\n";
128
129 oss << "# HELP " << p << "_active_workers Number of active workers\n";
130 oss << "# TYPE " << p << "_active_workers gauge\n";
131 oss << p << "_active_workers" << labels << " " << snapshot.active_workers << "\n\n";
132
133 // Per-worker utilization
134 oss << "# HELP " << p << "_worker_utilization_per_worker Per-worker utilization\n";
135 oss << "# TYPE " << p << "_worker_utilization_per_worker gauge\n";
136 for (std::size_t i = 0; i < snapshot.per_worker_utilization.size(); ++i) {
137 oss << p << "_worker_utilization_per_worker{worker=\"" << i << "\"";
138 if (!labels_.empty()) {
139 for (const auto& [key, value] : labels_) {
140 oss << "," << key << "=\"" << value << "\"";
141 }
142 }
143 oss << "} " << snapshot.per_worker_utilization[i] << "\n";
144 }
145
146 return oss.str();
147}
148
149// =============================================================================
150// JsonBackend Implementation
151// =============================================================================
152
153std::string JsonBackend::export_base(const BaseSnapshot& snapshot) const {
154 std::ostringstream oss;
155 oss << std::fixed << std::setprecision(2);
156
157 if (pretty_) {
158 oss << "{\n";
159 oss << " \"tasks\": {\n";
160 oss << " \"submitted\": " << snapshot.tasks_submitted << ",\n";
161 oss << " \"executed\": " << snapshot.tasks_executed << ",\n";
162 oss << " \"failed\": " << snapshot.tasks_failed << "\n";
163 oss << " },\n";
164 oss << " \"timing_ns\": {\n";
165 oss << " \"busy\": " << snapshot.total_busy_time_ns << ",\n";
166 oss << " \"idle\": " << snapshot.total_idle_time_ns << "\n";
167 oss << " }\n";
168 oss << "}";
169 } else {
170 oss << "{\"tasks\":{\"submitted\":" << snapshot.tasks_submitted
171 << ",\"executed\":" << snapshot.tasks_executed
172 << ",\"failed\":" << snapshot.tasks_failed
173 << "},\"timing_ns\":{\"busy\":" << snapshot.total_busy_time_ns
174 << ",\"idle\":" << snapshot.total_idle_time_ns << "}}";
175 }
176
177 return oss.str();
178}
179
180std::string JsonBackend::export_enhanced(const EnhancedSnapshot& snapshot) const {
181 std::ostringstream oss;
182 oss << std::fixed << std::setprecision(2);
183
184 if (pretty_) {
185 oss << "{\n";
186 oss << " \"tasks\": {\n";
187 oss << " \"submitted\": " << snapshot.tasks_submitted << ",\n";
188 oss << " \"executed\": " << snapshot.tasks_executed << ",\n";
189 oss << " \"failed\": " << snapshot.tasks_failed << "\n";
190 oss << " },\n";
191
192 oss << " \"latency_us\": {\n";
193 oss << " \"enqueue\": { \"p50\": " << snapshot.enqueue_latency_p50_us
194 << ", \"p90\": " << snapshot.enqueue_latency_p90_us
195 << ", \"p99\": " << snapshot.enqueue_latency_p99_us << " },\n";
196 oss << " \"execution\": { \"p50\": " << snapshot.execution_latency_p50_us
197 << ", \"p90\": " << snapshot.execution_latency_p90_us
198 << ", \"p99\": " << snapshot.execution_latency_p99_us << " },\n";
199 oss << " \"wait_time\": { \"p50\": " << snapshot.wait_time_p50_us
200 << ", \"p90\": " << snapshot.wait_time_p90_us
201 << ", \"p99\": " << snapshot.wait_time_p99_us << " }\n";
202 oss << " },\n";
203
204 oss << " \"throughput\": {\n";
205 oss << " \"rate_1s\": " << snapshot.throughput_1s << ",\n";
206 oss << " \"rate_1m\": " << snapshot.throughput_1m << "\n";
207 oss << " },\n";
208
209 oss << " \"queue\": {\n";
210 oss << " \"current_depth\": " << snapshot.current_queue_depth << ",\n";
211 oss << " \"peak_depth\": " << snapshot.peak_queue_depth << ",\n";
212 oss << " \"avg_depth\": " << snapshot.avg_queue_depth << "\n";
213 oss << " },\n";
214
215 oss << " \"workers\": {\n";
216 oss << " \"active\": " << snapshot.active_workers << ",\n";
217 oss << " \"utilization\": " << snapshot.worker_utilization << ",\n";
218 oss << " \"per_worker_utilization\": [";
219 for (std::size_t i = 0; i < snapshot.per_worker_utilization.size(); ++i) {
220 if (i > 0) {
221 oss << ", ";
222 }
223 oss << snapshot.per_worker_utilization[i];
224 }
225 oss << "]\n";
226 oss << " }\n";
227
228 oss << "}";
229 } else {
230 oss << "{\"tasks\":{\"submitted\":" << snapshot.tasks_submitted
231 << ",\"executed\":" << snapshot.tasks_executed
232 << ",\"failed\":" << snapshot.tasks_failed
233 << "},\"latency_us\":{\"enqueue\":{\"p50\":" << snapshot.enqueue_latency_p50_us
234 << ",\"p90\":" << snapshot.enqueue_latency_p90_us
235 << ",\"p99\":" << snapshot.enqueue_latency_p99_us
236 << "},\"execution\":{\"p50\":" << snapshot.execution_latency_p50_us
237 << ",\"p90\":" << snapshot.execution_latency_p90_us
238 << ",\"p99\":" << snapshot.execution_latency_p99_us
239 << "},\"wait_time\":{\"p50\":" << snapshot.wait_time_p50_us
240 << ",\"p90\":" << snapshot.wait_time_p90_us
241 << ",\"p99\":" << snapshot.wait_time_p99_us
242 << "}},\"throughput\":{\"rate_1s\":" << snapshot.throughput_1s
243 << ",\"rate_1m\":" << snapshot.throughput_1m
244 << "},\"queue\":{\"current_depth\":" << snapshot.current_queue_depth
245 << ",\"peak_depth\":" << snapshot.peak_queue_depth
246 << ",\"avg_depth\":" << snapshot.avg_queue_depth
247 << "},\"workers\":{\"active\":" << snapshot.active_workers
248 << ",\"utilization\":" << snapshot.worker_utilization
249 << ",\"per_worker_utilization\":[";
250 for (std::size_t i = 0; i < snapshot.per_worker_utilization.size(); ++i) {
251 if (i > 0) {
252 oss << ",";
253 }
254 oss << snapshot.per_worker_utilization[i];
255 }
256 oss << "]}}";
257 }
258
259 return oss.str();
260}
261
262// =============================================================================
263// LoggingBackend Implementation
264// =============================================================================
265
266std::string LoggingBackend::export_base(const BaseSnapshot& snapshot) const {
267 std::ostringstream oss;
268 oss << std::fixed << std::setprecision(2);
269
270 oss << "[" << prefix_ << "] Metrics Summary:\n";
271 oss << " Tasks: submitted=" << snapshot.tasks_submitted
272 << ", executed=" << snapshot.tasks_executed
273 << ", failed=" << snapshot.tasks_failed << "\n";
274
275 auto total = snapshot.total_busy_time_ns + snapshot.total_idle_time_ns;
276 double utilization = (total > 0)
277 ? 100.0 * static_cast<double>(snapshot.total_busy_time_ns) / static_cast<double>(total)
278 : 0.0;
279 oss << " Time: busy=" << (snapshot.total_busy_time_ns / 1'000'000.0) << "ms"
280 << ", idle=" << (snapshot.total_idle_time_ns / 1'000'000.0) << "ms"
281 << " (utilization=" << utilization << "%)";
282
283 return oss.str();
284}
285
286std::string LoggingBackend::export_enhanced(const EnhancedSnapshot& snapshot) const {
287 std::ostringstream oss;
288 oss << std::fixed << std::setprecision(2);
289
290 oss << "[" << prefix_ << "] Enhanced Metrics:\n";
291 oss << " Tasks: submitted=" << snapshot.tasks_submitted
292 << ", executed=" << snapshot.tasks_executed
293 << ", failed=" << snapshot.tasks_failed << "\n";
294
295 oss << " Latency (P50/P90/P99):\n";
296 oss << " Enqueue: " << snapshot.enqueue_latency_p50_us << "/"
297 << snapshot.enqueue_latency_p90_us << "/" << snapshot.enqueue_latency_p99_us << " us\n";
298 oss << " Execute: " << snapshot.execution_latency_p50_us << "/"
299 << snapshot.execution_latency_p90_us << "/" << snapshot.execution_latency_p99_us << " us\n";
300 oss << " Wait: " << snapshot.wait_time_p50_us << "/"
301 << snapshot.wait_time_p90_us << "/" << snapshot.wait_time_p99_us << " us\n";
302
303 oss << " Throughput: " << snapshot.throughput_1s << " ops/sec (1s), "
304 << snapshot.throughput_1m << " ops/sec (1m)\n";
305
306 oss << " Queue: depth=" << snapshot.current_queue_depth
307 << ", peak=" << snapshot.peak_queue_depth
308 << ", avg=" << snapshot.avg_queue_depth << "\n";
309
310 oss << " Workers: active=" << snapshot.active_workers
311 << ", utilization=" << (snapshot.worker_utilization * 100.0) << "%";
312
313 return oss.str();
314}
315
316} // namespace kcenon::thread::metrics
std::string export_base(const BaseSnapshot &snapshot) const override
Export base metrics snapshot.
std::string export_enhanced(const EnhancedSnapshot &snapshot) const override
Export enhanced metrics snapshot.
std::string export_enhanced(const EnhancedSnapshot &snapshot) const override
Export enhanced metrics snapshot.
std::string export_base(const BaseSnapshot &snapshot) const override
Export base metrics snapshot.
const std::map< std::string, std::string > & labels() const
Get all configured labels.
std::string prefix_
Metric name prefix.
std::map< std::string, std::string > labels_
Labels to attach to all metrics.
std::string export_base(const BaseSnapshot &snapshot) const override
Export base metrics snapshot.
std::string format_labels() const
Format labels for Prometheus output.
std::string export_enhanced(const EnhancedSnapshot &snapshot) const override
Export enhanced metrics snapshot.
Abstract interface for metrics export backends.
Base snapshot structure containing common metric values.
std::uint64_t tasks_executed
Total tasks successfully executed.
std::uint64_t tasks_failed
Total tasks that failed during execution.
std::uint64_t total_idle_time_ns
Total idle time across all workers in nanoseconds.
std::uint64_t total_busy_time_ns
Total busy time across all workers in nanoseconds.
std::uint64_t tasks_submitted
Total tasks submitted to the pool.
Enhanced snapshot with latency percentiles and throughput.
double throughput_1s
Tasks completed per second (1-second window).
std::size_t peak_queue_depth
Peak queue depth since last reset.
double worker_utilization
Overall worker utilization (0.0 - 1.0).
std::size_t active_workers
Number of active workers.
double enqueue_latency_p50_us
Median (P50) enqueue latency in microseconds.
std::uint64_t tasks_failed
Total tasks that failed during execution.
double enqueue_latency_p90_us
90th percentile enqueue latency in microseconds.
double wait_time_p90_us
90th percentile wait time in microseconds.
double execution_latency_p99_us
99th percentile execution latency in microseconds.
std::uint64_t tasks_executed
Total tasks successfully executed.
std::uint64_t tasks_submitted
Total tasks submitted to the pool.
double avg_queue_depth
Average queue depth over the sampling period.
double execution_latency_p50_us
Median execution latency in microseconds.
double wait_time_p50_us
Median wait time (queue time) in microseconds.
double throughput_1m
Tasks completed per second (1-minute window average).
double execution_latency_p90_us
90th percentile execution latency in microseconds.
std::size_t current_queue_depth
Current queue depth.
double wait_time_p99_us
99th percentile wait time in microseconds.
std::vector< double > per_worker_utilization
Per-worker utilization (0.0 - 1.0 each).
double enqueue_latency_p99_us
99th percentile enqueue latency in microseconds.