Thread System 0.3.1
High-performance C++20 thread pool with work stealing and DAG scheduling
Loading...
Searching...
No Matches
latency_histogram.cpp
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2024, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
6
#include <algorithm>
#include <bit>
#include <cmath>
#include <limits>
10
12
14 for (auto& bucket : buckets_) {
15 bucket.store(0, std::memory_order_relaxed);
16 }
17}
18
20 for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
21 buckets_[i].store(other.buckets_[i].load(std::memory_order_relaxed),
22 std::memory_order_relaxed);
23 }
24 total_count_.store(other.total_count_.load(std::memory_order_relaxed),
25 std::memory_order_relaxed);
26 total_sum_.store(other.total_sum_.load(std::memory_order_relaxed),
27 std::memory_order_relaxed);
28 min_value_.store(other.min_value_.load(std::memory_order_relaxed),
29 std::memory_order_relaxed);
30 max_value_.store(other.max_value_.load(std::memory_order_relaxed),
31 std::memory_order_relaxed);
32}
33
35 for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
36 buckets_[i].store(other.buckets_[i].load(std::memory_order_relaxed),
37 std::memory_order_relaxed);
38 other.buckets_[i].store(0, std::memory_order_relaxed);
39 }
40 total_count_.store(other.total_count_.exchange(0, std::memory_order_relaxed),
41 std::memory_order_relaxed);
42 total_sum_.store(other.total_sum_.exchange(0, std::memory_order_relaxed),
43 std::memory_order_relaxed);
44 min_value_.store(other.min_value_.exchange(
45 std::numeric_limits<std::uint64_t>::max(),
46 std::memory_order_relaxed),
47 std::memory_order_relaxed);
48 max_value_.store(other.max_value_.exchange(0, std::memory_order_relaxed),
49 std::memory_order_relaxed);
50}
51
53 if (this != &other) {
54 for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
55 buckets_[i].store(other.buckets_[i].load(std::memory_order_relaxed),
56 std::memory_order_relaxed);
57 }
58 total_count_.store(other.total_count_.load(std::memory_order_relaxed),
59 std::memory_order_relaxed);
60 total_sum_.store(other.total_sum_.load(std::memory_order_relaxed),
61 std::memory_order_relaxed);
62 min_value_.store(other.min_value_.load(std::memory_order_relaxed),
63 std::memory_order_relaxed);
64 max_value_.store(other.max_value_.load(std::memory_order_relaxed),
65 std::memory_order_relaxed);
66 }
67 return *this;
68}
69
71 if (this != &other) {
72 for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
73 buckets_[i].store(other.buckets_[i].load(std::memory_order_relaxed),
74 std::memory_order_relaxed);
75 other.buckets_[i].store(0, std::memory_order_relaxed);
76 }
77 total_count_.store(other.total_count_.exchange(0, std::memory_order_relaxed),
78 std::memory_order_relaxed);
79 total_sum_.store(other.total_sum_.exchange(0, std::memory_order_relaxed),
80 std::memory_order_relaxed);
81 min_value_.store(other.min_value_.exchange(
82 std::numeric_limits<std::uint64_t>::max(),
83 std::memory_order_relaxed),
84 std::memory_order_relaxed);
85 max_value_.store(other.max_value_.exchange(0, std::memory_order_relaxed),
86 std::memory_order_relaxed);
87 }
88 return *this;
89}
90
91void LatencyHistogram::record(std::chrono::nanoseconds value) {
92 record_ns(static_cast<std::uint64_t>(value.count()));
93}
94
95void LatencyHistogram::record_ns(std::uint64_t nanoseconds) {
96 const auto bucket_index = compute_bucket_index(nanoseconds);
97 buckets_[bucket_index].fetch_add(1, std::memory_order_relaxed);
98 total_count_.fetch_add(1, std::memory_order_relaxed);
99 total_sum_.fetch_add(nanoseconds, std::memory_order_relaxed);
100
101 // Update min atomically using CAS
102 auto current_min = min_value_.load(std::memory_order_relaxed);
103 while (nanoseconds < current_min) {
104 if (min_value_.compare_exchange_weak(current_min, nanoseconds,
105 std::memory_order_relaxed,
106 std::memory_order_relaxed)) {
107 break;
108 }
109 }
110
111 // Update max atomically using CAS
112 auto current_max = max_value_.load(std::memory_order_relaxed);
113 while (nanoseconds > current_max) {
114 if (max_value_.compare_exchange_weak(current_max, nanoseconds,
115 std::memory_order_relaxed,
116 std::memory_order_relaxed)) {
117 break;
118 }
119 }
120}
121
122double LatencyHistogram::percentile(double p) const {
123 const auto total = total_count_.load(std::memory_order_relaxed);
124 if (total == 0) {
125 return 0.0;
126 }
127
128 // Clamp percentile to valid range
129 p = std::clamp(p, 0.0, 1.0);
130 const auto target_count = static_cast<std::uint64_t>(std::ceil(p * total));
131
132 std::uint64_t cumulative = 0;
133 for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
134 cumulative += buckets_[i].load(std::memory_order_relaxed);
135 if (cumulative >= target_count) {
136 // Linear interpolation within the bucket
137 const auto bucket_count_val = buckets_[i].load(std::memory_order_relaxed);
138 if (bucket_count_val == 0) {
139 return bucket_midpoint(i);
140 }
141
142 const auto prev_cumulative = cumulative - bucket_count_val;
143 const auto target_in_bucket = target_count - prev_cumulative;
144 const auto fraction =
145 static_cast<double>(target_in_bucket) / bucket_count_val;
146
147 const auto lower = bucket_lower_bound(i);
148 const auto upper = bucket_upper_bound(i);
149
150 return lower + fraction * (upper - lower);
151 }
152 }
153
154 // Should not reach here, but return max bucket value
155 return static_cast<double>(bucket_upper_bound(BUCKET_COUNT - 1));
156}
157
159 const auto total = total_count_.load(std::memory_order_relaxed);
160 if (total == 0) {
161 return 0.0;
162 }
163 return static_cast<double>(total_sum_.load(std::memory_order_relaxed)) / total;
164}
165
167 const auto total = total_count_.load(std::memory_order_relaxed);
168 if (total < 2) {
169 return 0.0;
170 }
171
172 const auto mean_val = mean();
173 double sum_squared_diff = 0.0;
174
175 for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
176 const auto bucket_count_val = buckets_[i].load(std::memory_order_relaxed);
177 if (bucket_count_val > 0) {
178 const auto midpoint = bucket_midpoint(i);
179 const auto diff = midpoint - mean_val;
180 sum_squared_diff += bucket_count_val * diff * diff;
181 }
182 }
183
184 return std::sqrt(sum_squared_diff / (total - 1));
185}
186
187std::uint64_t LatencyHistogram::min() const {
188 const auto total = total_count_.load(std::memory_order_relaxed);
189 if (total == 0) {
190 return 0;
191 }
192 return min_value_.load(std::memory_order_relaxed);
193}
194
195std::uint64_t LatencyHistogram::max() const {
196 const auto total = total_count_.load(std::memory_order_relaxed);
197 if (total == 0) {
198 return 0;
199 }
200 return max_value_.load(std::memory_order_relaxed);
201}
202
203std::uint64_t LatencyHistogram::count() const {
204 return total_count_.load(std::memory_order_relaxed);
205}
206
207std::uint64_t LatencyHistogram::sum() const {
208 return total_sum_.load(std::memory_order_relaxed);
209}
210
212 for (auto& bucket : buckets_) {
213 bucket.store(0, std::memory_order_relaxed);
214 }
215 total_count_.store(0, std::memory_order_relaxed);
216 total_sum_.store(0, std::memory_order_relaxed);
217 min_value_.store(std::numeric_limits<std::uint64_t>::max(),
218 std::memory_order_relaxed);
219 max_value_.store(0, std::memory_order_relaxed);
220}
221
223 return total_count_.load(std::memory_order_relaxed) == 0;
224}
225
227 for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
228 buckets_[i].fetch_add(other.buckets_[i].load(std::memory_order_relaxed),
229 std::memory_order_relaxed);
230 }
231 total_count_.fetch_add(other.total_count_.load(std::memory_order_relaxed),
232 std::memory_order_relaxed);
233 total_sum_.fetch_add(other.total_sum_.load(std::memory_order_relaxed),
234 std::memory_order_relaxed);
235
236 // Update min atomically
237 auto other_min = other.min_value_.load(std::memory_order_relaxed);
238 auto current_min = min_value_.load(std::memory_order_relaxed);
239 while (other_min < current_min) {
240 if (min_value_.compare_exchange_weak(current_min, other_min,
241 std::memory_order_relaxed,
242 std::memory_order_relaxed)) {
243 break;
244 }
245 }
246
247 // Update max atomically
248 auto other_max = other.max_value_.load(std::memory_order_relaxed);
249 auto current_max = max_value_.load(std::memory_order_relaxed);
250 while (other_max > current_max) {
251 if (max_value_.compare_exchange_weak(current_max, other_max,
252 std::memory_order_relaxed,
253 std::memory_order_relaxed)) {
254 break;
255 }
256 }
257}
258
259std::uint64_t LatencyHistogram::bucket_count(std::size_t bucket_index) const {
260 if (bucket_index >= BUCKET_COUNT) {
261 return 0;
262 }
263 return buckets_[bucket_index].load(std::memory_order_relaxed);
264}
265
266std::uint64_t LatencyHistogram::bucket_lower_bound(std::size_t bucket_index) {
267 if (bucket_index == 0) {
268 return 0;
269 }
270 if (bucket_index >= BUCKET_COUNT) {
271 return std::numeric_limits<std::uint64_t>::max();
272 }
273 return static_cast<std::uint64_t>(1ULL << (bucket_index - 1));
274}
275
276std::uint64_t LatencyHistogram::bucket_upper_bound(std::size_t bucket_index) {
277 if (bucket_index >= BUCKET_COUNT - 1) {
278 return std::numeric_limits<std::uint64_t>::max();
279 }
280 return static_cast<std::uint64_t>(1ULL << bucket_index);
281}
282
283std::size_t LatencyHistogram::compute_bucket_index(std::uint64_t value) {
284 if (value == 0) {
285 return 0;
286 }
287
288 // Use bit manipulation to find the highest set bit (floor of log2)
289 // This gives us the bucket index directly
290#if defined(__GNUC__) || defined(__clang__)
291 const auto leading_zeros = __builtin_clzll(value);
292 const auto highest_bit = 63 - leading_zeros;
293#elif defined(_MSC_VER)
294 unsigned long highest_bit;
295 _BitScanReverse64(&highest_bit, value);
296#else
297 // Fallback: portable implementation
298 std::size_t highest_bit = 0;
299 auto temp = value;
300 while (temp >>= 1) {
301 ++highest_bit;
302 }
303#endif
304
305 // Bucket index is highest_bit + 1, capped at BUCKET_COUNT - 1
306 return std::min(static_cast<std::size_t>(highest_bit + 1), BUCKET_COUNT - 1);
307}
308
309double LatencyHistogram::bucket_midpoint(std::size_t bucket_index) {
310 const auto lower = bucket_lower_bound(bucket_index);
311 const auto upper = bucket_upper_bound(bucket_index);
312
313 if (upper == std::numeric_limits<std::uint64_t>::max()) {
314 // For the last bucket, use lower bound as estimate
315 return static_cast<double>(lower);
316 }
317
318 return (static_cast<double>(lower) + static_cast<double>(upper)) / 2.0;
319}
320
321} // namespace kcenon::thread::metrics
HDR-style histogram for latency distribution with logarithmic buckets.
LatencyHistogram()
Default constructor - initializes all buckets to zero.
void record(std::chrono::nanoseconds value)
Record a latency value.
double mean() const
Calculate the arithmetic mean.
static constexpr std::size_t BUCKET_COUNT
Number of histogram buckets.
std::uint64_t count() const
Get the total count of recorded values.
std::uint64_t sum() const
Get the sum of all recorded values.
std::uint64_t bucket_count(std::size_t bucket_index) const
Get the bucket count at a specific index.
static std::size_t compute_bucket_index(std::uint64_t value)
Compute the bucket index for a given value.
std::atomic< std::uint64_t > min_value_
Minimum recorded value.
std::atomic< std::uint64_t > total_count_
Total number of recorded values.
std::array< std::atomic< std::uint64_t >, BUCKET_COUNT > buckets_
Histogram buckets using logarithmic distribution.
bool empty() const
Check if the histogram is empty.
static std::uint64_t bucket_lower_bound(std::size_t bucket_index)
Get the lower bound of a bucket.
double stddev() const
Calculate the standard deviation.
double percentile(double p) const
Calculate the value at a given percentile.
std::atomic< std::uint64_t > max_value_
Maximum recorded value.
static std::uint64_t bucket_upper_bound(std::size_t bucket_index)
Get the upper bound of a bucket.
std::uint64_t min() const
Get the minimum recorded value.
std::atomic< std::uint64_t > total_sum_
Sum of all recorded values.
void reset()
Reset all buckets and counters to zero.
static double bucket_midpoint(std::size_t bucket_index)
Get the midpoint value of a bucket for estimation.
void record_ns(std::uint64_t nanoseconds)
Record a raw nanosecond value.
LatencyHistogram & operator=(const LatencyHistogram &other)
Copy assignment operator.
std::uint64_t max() const
Get the maximum recorded value.
void merge(const LatencyHistogram &other)
Merge another histogram into this one.
HDR-style histogram for latency distribution with logarithmic buckets.