Monitoring System 0.1.0
System resource monitoring with pluggable collectors and alerting
Loading...
Searching...
No Matches
test_fault_tolerance.cpp
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
5#include <gtest/gtest.h>
6#include <thread>
7#include <chrono>
8#include <atomic>
12
13using namespace kcenon::monitoring;
14
// Test fixture shared by all fault-tolerance tests. It owns an atomic call
// counter plus a set of helper operations that succeed, fail, or stall on
// demand, so each test can observe how often the wrapped operation really ran.
// NOTE(review): this listing skips embedded lines 19 and 24-26, so part of
// SetUp()/TearDown() is not visible here — confirm against the original file.
15class FaultToleranceTest : public ::testing::Test {
16protected:
 // Runs before each test; the visible part resets the call counter to zero.
17 void SetUp() override {
18 call_count = 0;
20 }
21
 // Runs after each test; only the comment below is visible in this listing.
22 void TearDown() override {
23 // Clean up registries
27 }
28
 // Number of times any helper operation below has been invoked.
29 std::atomic<int> call_count{0};
 // When > 0, failing_operation() fails for this many initial calls.
30 std::atomic<int> success_after_attempts{0};
31
32 // Helper function that fails for the first N attempts
 // N is success_after_attempts. Each call increments call_count; calls numbered
 // 1..N return an operation_failed error, every other call returns ok(42).
 // With N == 0 the operation always succeeds.
33 kcenon::common::Result<int> failing_operation() {
34 int current_call = ++call_count;
35 if (success_after_attempts > 0 && current_call <= success_after_attempts) {
36 return kcenon::common::make_error<int>(static_cast<int>(monitoring_error_code::operation_failed),
37 "Simulated failure on attempt " + std::to_string(current_call));
38 }
39 return kcenon::common::ok(42);
40 }
41
42 // Helper function that always fails
 // Increments call_count and returns an operation_failed error.
43 kcenon::common::Result<int> always_failing_operation() {
44 ++call_count;
45 return kcenon::common::make_error<int>(static_cast<int>(monitoring_error_code::operation_failed), "Always fails");
46 }
47
48 // Helper function that always succeeds
 // Increments call_count and returns ok(100).
49 kcenon::common::Result<int> always_succeeding_operation() {
50 ++call_count;
51 return kcenon::common::ok(100);
52 }
53
54 // Slow operation for timeout testing
 // Increments call_count, blocks the calling thread for `delay`, then returns ok(200).
55 kcenon::common::Result<int> slow_operation(std::chrono::milliseconds delay) {
56 ++call_count;
57 std::this_thread::sleep_for(delay);
58 return kcenon::common::ok(200);
59 }
60};
61
62// Circuit Breaker Tests
// Verifies that a freshly constructed breaker reports CLOSED, lets a successful
// call through (value 100, operation invoked exactly once) and is still CLOSED
// afterwards.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 64 is absent) — presumably a
// circuit_breaker_config; confirm against the original file.
63TEST_F(FaultToleranceTest, CircuitBreakerClosedState) {
65 config.failure_threshold = 3;
66
67 circuit_breaker breaker(config);
68
69 EXPECT_EQ(breaker.get_state(), circuit_state::CLOSED);
70
71 auto result = execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_succeeding_operation(); });
72 EXPECT_TRUE(result.is_ok());
73 EXPECT_EQ(result.value(), 100);
74 EXPECT_EQ(breaker.get_state(), circuit_state::CLOSED);
75 EXPECT_EQ(call_count.load(), 1);
76}
77
// Verifies that failure_threshold (3) consecutive failures move the breaker to
// OPEN, and that the next call is rejected with circuit_breaker_open without
// invoking the wrapped operation (call_count stays at 3).
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 79 is absent) — presumably a
// circuit_breaker_config; confirm against the original file.
78TEST_F(FaultToleranceTest, CircuitBreakerOpensAfterFailures) {
80 config.failure_threshold = 3;
81
82 circuit_breaker breaker(config);
83
84 // First 3 failures should open the circuit
85 for (int i = 0; i < 3; ++i) {
86 auto result = execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_failing_operation(); });
87 EXPECT_TRUE(result.is_err());
88 }
89
90 EXPECT_EQ(breaker.get_state(), circuit_state::OPEN);
91 EXPECT_EQ(call_count.load(), 3);
92
93 // Next call should be rejected without calling operation
94 auto result = execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_failing_operation(); });
95 EXPECT_TRUE(result.is_err());
96 EXPECT_EQ(call_count.load(), 3); // Should not increment
97 EXPECT_EQ(result.error().code, static_cast<int>(monitoring_error_code::circuit_breaker_open));
98}
99
// Verifies the OPEN -> HALF_OPEN transition: two failures open the breaker, the
// test sleeps 150 ms (longer than the configured 100 ms timeout), and the next
// successful call is expected to leave the breaker in HALF_OPEN.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 101 is absent) — presumably a
// circuit_breaker_config; confirm against the original file.
100TEST_F(FaultToleranceTest, CircuitBreakerHalfOpenTransition) {
102 config.failure_threshold = 2;
103 config.timeout = std::chrono::milliseconds(100);
104
105 circuit_breaker breaker(config);
106
107 // Open the circuit
108 for (int i = 0; i < 2; ++i) {
109 execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_failing_operation(); });
110 }
111 EXPECT_EQ(breaker.get_state(), circuit_state::OPEN);
112
113 // Wait for reset timeout
114 std::this_thread::sleep_for(std::chrono::milliseconds(150));
115
116 // Next call should transition to half-open
117 auto result = execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_succeeding_operation(); });
118 EXPECT_TRUE(result.is_ok());
119 EXPECT_EQ(breaker.get_state(), circuit_state::HALF_OPEN);
120}
121
// Verifies the HALF_OPEN -> CLOSED transition: with success_threshold = 2, the
// first success after the timeout is expected to leave the breaker HALF_OPEN and
// the second success is expected to close it.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 123 is absent) — presumably a
// circuit_breaker_config; confirm against the original file.
122TEST_F(FaultToleranceTest, CircuitBreakerHalfOpenToClosedTransition) {
124 config.failure_threshold = 2;
125 config.success_threshold = 2;
126 config.timeout = std::chrono::milliseconds(50);
127
128 circuit_breaker breaker(config);
129
130 // Open the circuit
131 for (int i = 0; i < 2; ++i) {
132 execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_failing_operation(); });
133 }
134
135 // Wait and transition to half-open
 // The 100 ms sleep exceeds the 50 ms timeout configured above.
136 std::this_thread::sleep_for(std::chrono::milliseconds(100));
137 execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_succeeding_operation(); });
138 EXPECT_EQ(breaker.get_state(), circuit_state::HALF_OPEN);
139
140 // One more success should close the circuit
141 auto result = execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_succeeding_operation(); });
142 EXPECT_TRUE(result.is_ok());
143 EXPECT_EQ(breaker.get_state(), circuit_state::CLOSED);
144}
145
// Demonstrates a caller-side fallback: a single failure opens the breaker
// (failure_threshold = 1) and, when the breaker refuses the request, the test
// substitutes a locally built ok(999) result.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 147 is absent) — presumably a
// circuit_breaker_config; confirm against the original file.
146TEST_F(FaultToleranceTest, CircuitBreakerWithFallback) {
148 config.failure_threshold = 1;
149
150 circuit_breaker breaker(config);
151
152 // Open the circuit with one failure
153 execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_failing_operation(); });
154 EXPECT_EQ(breaker.get_state(), circuit_state::OPEN);
155
156 // Use fallback when circuit is open
 // NOTE(review): the assertions below only run when allow_request() returns
 // false; if the breaker allowed the request this branch would be skipped and
 // the fallback path would go unchecked.
157 if (!breaker.allow_request()) {
158 auto result = kcenon::common::ok(999);
159 EXPECT_TRUE(result.is_ok());
160 EXPECT_EQ(result.value(), 999);
161 }
162}
163
// Runs a success/failure/success sequence through the breaker and checks that
// get_stats() returns something non-empty. The content of the stats is not
// inspected.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 165 is absent) — presumably a
// circuit_breaker_config; confirm against the original file.
164TEST_F(FaultToleranceTest, CircuitBreakerStats) {
166 config.failure_threshold = 3;
167
168 circuit_breaker breaker(config);
169
170 // Execute some operations
 // Only one failure occurs, which is below the threshold of 3 set above.
171 execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_succeeding_operation(); });
172 execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_failing_operation(); });
173 execute_with_circuit_breaker<int>(breaker, "test", [this]() { return always_succeeding_operation(); });
174
175 auto stats = breaker.get_stats();
176 EXPECT_FALSE(stats.empty());
177}
178
179// Retry Policy Tests
180TEST_F(FaultToleranceTest, RetryExecutorBasicRetry) {
181 auto config = create_exponential_backoff_config(3, std::chrono::milliseconds(10));
182 retry_executor<int> executor("test_retry", config);
183
184 success_after_attempts = 2; // Succeed on 3rd attempt
185
186 auto result = executor.execute([this]() { return failing_operation(); });
187
188 EXPECT_TRUE(result.is_ok());
189 EXPECT_EQ(result.value(), 42);
190 EXPECT_EQ(call_count.load(), 3);
191
192 auto metrics = executor.get_metrics();
193 EXPECT_EQ(metrics.total_executions, 1);
194 EXPECT_EQ(metrics.successful_executions, 1);
195 EXPECT_EQ(metrics.total_retries, 2);
196}
197
198TEST_F(FaultToleranceTest, RetryExecutorMaxAttemptsExceeded) {
199 auto config = create_exponential_backoff_config(2, std::chrono::milliseconds(10));
200 retry_executor<int> executor("test_retry", config);
201
202 auto result = executor.execute([this]() { return always_failing_operation(); });
203
204 EXPECT_TRUE(result.is_err());
205 EXPECT_EQ(call_count.load(), 2);
206
207 auto metrics = executor.get_metrics();
208 EXPECT_EQ(metrics.total_executions, 1);
209 EXPECT_EQ(metrics.failed_executions, 1);
210 EXPECT_EQ(metrics.total_retries, 1);
211}
212
213TEST_F(FaultToleranceTest, RetryExecutorFixedDelay) {
214 retry_config config = create_fixed_delay_config(3, std::chrono::milliseconds(50));
215 retry_executor<int> executor("test_retry", config);
216
217 success_after_attempts = 2;
218
219 auto start = std::chrono::steady_clock::now();
220 auto result = executor.execute([this]() { return failing_operation(); });
221 auto duration = std::chrono::steady_clock::now() - start;
222
223 EXPECT_TRUE(result.is_ok());
224 EXPECT_GE(duration, std::chrono::milliseconds(100)); // At least 2 delays
225}
226
227TEST_F(FaultToleranceTest, RetryExecutorFibonacciBackoff) {
228 auto config = create_fibonacci_backoff_config(4, std::chrono::milliseconds(10));
229 retry_executor<int> executor("test_retry", config);
230
231 success_after_attempts = 3;
232
233 auto result = executor.execute([this]() { return failing_operation(); });
234
235 EXPECT_TRUE(result.is_ok());
236 EXPECT_EQ(call_count.load(), 4);
237}
238
// Verifies that a custom should_retry predicate is honoured: the predicate only
// accepts operation_timeout errors, the operation fails with operation_failed,
// so the executor is expected to give up after a single call.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 240 is absent) — presumably a retry_config;
// confirm against the original file.
239TEST_F(FaultToleranceTest, RetryExecutorCustomShouldRetry) {
241 config.should_retry = [](const error_info& error) {
242 // Only retry operation_timeout errors
243 return error.code == monitoring_error_code::operation_timeout;
244 };
245
246 retry_executor<int> executor("test_retry", config);
247
248 // This should not retry because it's not a timeout error
249 auto result = executor.execute([this]() { return always_failing_operation(); });
250
251 EXPECT_TRUE(result.is_err());
252 EXPECT_EQ(call_count.load(), 1); // No retries
253}
254
255// Fault Tolerance Manager Tests
// Verifies a manager with both mechanisms enabled and circuit_breaker_first set:
// an operation that fails once and then succeeds is expected to yield 42 after
// exactly two invocations.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 257 is absent) — presumably the configuration
// type accepted by fault_tolerance_manager; confirm against the original file.
256TEST_F(FaultToleranceTest, FaultToleranceManagerCircuitBreakerFirst) {
258 config.enable_circuit_breaker = true;
259 config.enable_retry = true;
260 config.circuit_breaker_first = true;
261 config.circuit_config.failure_threshold = 2;
262 config.retry_cfg = create_exponential_backoff_config(2, std::chrono::milliseconds(10));
263
264 fault_tolerance_manager<int> manager("test_manager", config);
265
266 success_after_attempts = 1; // Succeed on 2nd attempt
267
268 auto result = manager.execute([this]() { return failing_operation(); });
269
270 EXPECT_TRUE(result.is_ok());
271 EXPECT_EQ(result.value(), 42);
272 EXPECT_EQ(call_count.load(), 2); // Retry executed
273}
274
// Verifies a manager with both mechanisms enabled and circuit_breaker_first
// cleared: an operation that fails twice and then succeeds is expected to
// complete successfully after three invocations.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 276 is absent) — presumably the configuration
// type accepted by fault_tolerance_manager; confirm against the original file.
275TEST_F(FaultToleranceTest, FaultToleranceManagerRetryFirst) {
277 config.enable_circuit_breaker = true;
278 config.enable_retry = true;
279 config.circuit_breaker_first = false;
280 config.circuit_config.failure_threshold = 5;
281 config.retry_cfg = create_exponential_backoff_config(3, std::chrono::milliseconds(10));
282
283 fault_tolerance_manager<int> manager("test_manager", config);
284
 // failing_operation() fails on calls 1-2 and succeeds on call 3.
285 success_after_attempts = 2;
286
287 auto result = manager.execute([this]() { return failing_operation(); });
288
289 EXPECT_TRUE(result.is_ok());
290 EXPECT_EQ(call_count.load(), 3);
291}
292
// Verifies that a manager with retry disabled and only the circuit breaker
// enabled passes a successful operation through and returns its value (100).
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 294 is absent) — presumably the configuration
// type accepted by fault_tolerance_manager; confirm against the original file.
293TEST_F(FaultToleranceTest, FaultToleranceManagerOnlyCircuitBreaker) {
295 config.enable_circuit_breaker = true;
296 config.enable_retry = false;
297 config.circuit_config.failure_threshold = 2;
298
299 fault_tolerance_manager<int> manager("test_manager", config);
300
301 auto result = manager.execute([this]() { return always_succeeding_operation(); });
302
303 EXPECT_TRUE(result.is_ok());
304 EXPECT_EQ(result.value(), 100);
305}
306
// Verifies that a manager with the circuit breaker disabled and only retry
// enabled retries an operation that fails twice and succeeds on the third call.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 308 is absent) — presumably the configuration
// type accepted by fault_tolerance_manager; confirm against the original file.
307TEST_F(FaultToleranceTest, FaultToleranceManagerOnlyRetry) {
309 config.enable_circuit_breaker = false;
310 config.enable_retry = true;
311 config.retry_cfg = create_exponential_backoff_config(3, std::chrono::milliseconds(10));
312
313 fault_tolerance_manager<int> manager("test_manager", config);
314
 // failing_operation() fails on calls 1-2 and succeeds on call 3.
315 success_after_attempts = 2;
316
317 auto result = manager.execute([this]() { return failing_operation(); });
318
319 EXPECT_TRUE(result.is_ok());
320 EXPECT_EQ(call_count.load(), 3);
321}
322
// Verifies execute_with_timeout(): an operation that sleeps for 500 ms is run
// with a 50 ms limit and the result is expected to be an operation_timeout
// error.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 324 is absent) — presumably the configuration
// type accepted by fault_tolerance_manager; confirm against the original file.
323TEST_F(FaultToleranceTest, FaultToleranceManagerWithTimeout) {
325 config.enable_circuit_breaker = false;
326 config.enable_retry = true;
327 config.retry_cfg = create_exponential_backoff_config(2, std::chrono::milliseconds(10));
328
329 fault_tolerance_manager<int> manager("test_manager", config);
330
331 auto result = manager.execute_with_timeout(
332 [this]() { return slow_operation(std::chrono::milliseconds(500)); },
333 std::chrono::milliseconds(50)
334 );
335
336 EXPECT_TRUE(result.is_err());
337 EXPECT_EQ(result.error().code, static_cast<int>(monitoring_error_code::operation_timeout));
338}
339
// Verifies manager-level metrics: after one successful and one failing
// execute() call the manager is expected to report 2 operations, 1 success,
// 1 failure and an overall success rate of about 0.5.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 341 is absent) — presumably the configuration
// type accepted by fault_tolerance_manager; confirm against the original file.
340TEST_F(FaultToleranceTest, FaultToleranceManagerMetrics) {
342 config.enable_circuit_breaker = true;
343 config.enable_retry = true;
344 config.circuit_config.failure_threshold = 5;
345 config.retry_cfg = create_exponential_backoff_config(2, std::chrono::milliseconds(10));
346
347 fault_tolerance_manager<int> manager("test_manager", config);
348
349 // Execute successful operation
350 manager.execute([this]() { return always_succeeding_operation(); });
351
352 // Execute failing operation (will retry once)
353 manager.execute([this]() { return always_failing_operation(); });
354
 // Metrics are counted per execute() call here, not per individual attempt.
355 auto metrics = manager.get_metrics();
356 EXPECT_EQ(metrics.total_operations, 2);
357 EXPECT_EQ(metrics.successful_operations, 1);
358 EXPECT_EQ(metrics.failed_operations, 1);
359 EXPECT_NEAR(metrics.get_overall_success_rate(), 0.5, 0.01);
360}
361
// Verifies is_healthy(): a fresh manager is expected to report healthy, and
// after two failing executions (matching failure_threshold = 2) it is expected
// to report unhealthy.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 363 is absent) — presumably the configuration
// type accepted by fault_tolerance_manager; confirm against the original file.
362TEST_F(FaultToleranceTest, FaultToleranceManagerHealthCheck) {
364 config.enable_circuit_breaker = true;
365 config.circuit_config.failure_threshold = 2;
366
367 fault_tolerance_manager<int> manager("test_manager", config);
368
369 // Initially healthy
370 auto health = manager.is_healthy();
371 EXPECT_TRUE(health.is_ok());
372 EXPECT_TRUE(health.value());
373
374 // Open circuit breaker
375 for (int i = 0; i < 2; ++i) {
376 manager.execute([this]() { return always_failing_operation(); });
377 }
378
379 // Should now be unhealthy due to open circuit
 // The health query itself must still succeed; only its boolean payload flips.
380 health = manager.is_healthy();
381 EXPECT_TRUE(health.is_ok());
382 EXPECT_FALSE(health.value());
383}
384
385// Registry Tests
386TEST_F(FaultToleranceTest, CircuitBreakerRegistry) {
387 auto& registry = global_circuit_breaker_registry();
388
389 auto breaker = std::make_shared<circuit_breaker>();
390 registry.register_circuit_breaker("test", breaker);
391
392 auto retrieved = registry.get_circuit_breaker("test");
393 EXPECT_EQ(retrieved, breaker);
394
395 auto names = registry.get_all_names();
396 EXPECT_EQ(names.size(), 1);
397 EXPECT_EQ(names[0], "test");
398
399 registry.remove_circuit_breaker("test");
400 retrieved = registry.get_circuit_breaker("test");
401 EXPECT_EQ(retrieved, nullptr);
402}
403
404TEST_F(FaultToleranceTest, RetryExecutorRegistry) {
405 auto& registry = global_retry_executor_registry();
406
407 auto executor = std::make_shared<retry_executor<int>>("test_executor");
408 registry.register_executor<int>("test", executor);
409
410 auto retrieved = registry.get_executor<int>("test");
411 EXPECT_EQ(retrieved, executor);
412
413 auto names = registry.get_all_names();
414 EXPECT_EQ(names.size(), 1);
415 EXPECT_EQ(names[0], "test");
416
417 registry.remove_executor("test");
418 retrieved = registry.get_executor<int>("test");
419 EXPECT_EQ(retrieved, nullptr);
420}
421
422TEST_F(FaultToleranceTest, FaultToleranceRegistry) {
423 auto& registry = global_fault_tolerance_registry();
424
425 auto manager = std::make_shared<fault_tolerance_manager<int>>("test_manager");
426 registry.register_manager<int>("test", manager);
427
428 auto retrieved = registry.get_manager<int>("test");
429 EXPECT_EQ(retrieved, manager);
430
431 auto names = registry.get_all_names();
432 EXPECT_EQ(names.size(), 1);
433 EXPECT_EQ(names[0], "test");
434
435 registry.remove_manager("test");
436 retrieved = registry.get_manager<int>("test");
437 EXPECT_EQ(retrieved, nullptr);
438}
439
440// Configuration Validation Tests
441TEST_F(FaultToleranceTest, RetryConfigValidation) {
442 retry_config config;
443
444 // Valid config
445 EXPECT_TRUE(config.validate());
446
447 // Invalid max attempts
448 config.max_attempts = 0;
449 EXPECT_FALSE(config.validate());
450
451 // Reset to valid
452 config.max_attempts = 3;
453 EXPECT_TRUE(config.validate());
454
455 // Invalid backoff multiplier
456 config.backoff_multiplier = 0.5;
457 EXPECT_FALSE(config.validate());
458}
459
// Verifies configuration validation for the manager: the initial configuration
// is expected to validate, disabling both mechanisms is expected to fail
// validation, and re-enabling retry alone is expected to pass again.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 461 is absent) — presumably the configuration
// type accepted by fault_tolerance_manager; confirm against the original file.
460TEST_F(FaultToleranceTest, FaultToleranceConfigValidation) {
462
463 // Valid config
464 EXPECT_TRUE(config.validate());
465
466 // Both mechanisms disabled
467 config.enable_circuit_breaker = false;
468 config.enable_retry = false;
469 EXPECT_FALSE(config.validate());
470
471 // Enable one mechanism
472 config.enable_retry = true;
473 EXPECT_TRUE(config.validate());
474}
475
476// Concurrency Tests
// Drives one breaker from 4 threads, each issuing 100 always-successful calls,
// and checks that every one of the 400 calls is reported as ok.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 478 is absent) — presumably a
// circuit_breaker_config; confirm against the original file.
477TEST_F(FaultToleranceTest, CircuitBreakerConcurrency) {
479 config.failure_threshold = 10;
480
481 circuit_breaker breaker(config);
482
483 constexpr int num_threads = 4;
484 constexpr int operations_per_thread = 100;
485
486 std::vector<std::thread> threads;
 // Shared tally of ok results; atomic because all worker threads increment it.
487 std::atomic<int> successful_operations{0};
488
489 for (int i = 0; i < num_threads; ++i) {
 // The breaker and the tally are captured by reference; both outlive the
 // threads because every thread is joined before the test returns.
490 threads.emplace_back([&breaker, &successful_operations, operations_per_thread]() {
491 for (int j = 0; j < operations_per_thread; ++j) {
492 auto result = execute_with_circuit_breaker<int>(breaker, "test", []() { return kcenon::common::ok(1); });
493 if (result.is_ok()) {
494 successful_operations++;
495 }
496 }
497 });
498 }
499
 // Wait for all workers before reading the tally.
500 for (auto& thread : threads) {
501 thread.join();
502 }
503
504 EXPECT_EQ(successful_operations.load(), num_threads * operations_per_thread);
505}
506
507// Edge Cases
// Verifies that replacing an OPEN breaker with a newly constructed one (same
// configuration) gives a CLOSED breaker that accepts a successful call.
// NOTE(review): `config` is used below but its declaration is not visible in
// this listing (embedded line 509 is absent) — presumably a
// circuit_breaker_config; confirm against the original file.
508TEST_F(FaultToleranceTest, CircuitBreakerResetViaRecreation) {
510 config.failure_threshold = 2;
511
512 auto breaker = std::make_unique<circuit_breaker>(config);
513
514 // Open the circuit
515 for (int i = 0; i < 2; ++i) {
516 execute_with_circuit_breaker<int>(*breaker, "test", [this]() { return always_failing_operation(); });
517 }
518 EXPECT_EQ(breaker->get_state(), circuit_state::OPEN);
519
520 // Reset by recreating
 // Assigning a new unique_ptr destroys the old (OPEN) breaker instance.
521 breaker = std::make_unique<circuit_breaker>(config);
522 EXPECT_EQ(breaker->get_state(), circuit_state::CLOSED);
523
524 // Should work normally now
525 auto result = execute_with_circuit_breaker<int>(*breaker, "test", [this]() { return always_succeeding_operation(); });
526 EXPECT_TRUE(result.is_ok());
527}
528
529TEST_F(FaultToleranceTest, RetryExecutorResetMetrics) {
530 auto config = create_exponential_backoff_config(3, std::chrono::milliseconds(10));
531 retry_executor<int> executor("reset_test", config);
532
533 // Execute some operations
534 executor.execute([this]() { return always_succeeding_operation(); });
535 executor.execute([this]() { return always_failing_operation(); });
536
537 auto metrics_before = executor.get_metrics();
538 EXPECT_GT(metrics_before.total_executions, 0);
539
540 // Reset metrics
541 executor.reset_metrics();
542
543 auto metrics_after = executor.get_metrics();
544 EXPECT_EQ(metrics_after.total_executions, 0);
545 EXPECT_EQ(metrics_after.successful_executions, 0);
546 EXPECT_EQ(metrics_after.failed_executions, 0);
547}
Circuit breaker integration for monitoring_system.
kcenon::common::Result< int > always_succeeding_operation()
kcenon::common::Result< int > always_failing_operation()
kcenon::common::Result< int > slow_operation(std::chrono::milliseconds delay)
kcenon::common::Result< int > failing_operation()
std::atomic< int > success_after_attempts
std::atomic< int > call_count
Fault tolerance manager template class.
common::Result< T > execute_with_timeout(Func &&func, std::chrono::milliseconds timeout)
Execute a function with timeout.
common::Result< bool > is_healthy()
Check if fault tolerance manager is healthy.
fault_tolerance_metrics get_metrics() const
Get fault tolerance metrics.
common::Result< T > execute(Func &&func)
Execute a function with fault tolerance.
Retry executor template class.
void reset_metrics()
Reset metrics.
retry_metrics get_metrics() const
Get retry metrics.
common::Result< T > execute(Func &&func)
Execute a function with retry logic.
Fault tolerance manager coordinating circuit breakers and retries.
@ delay
Delay requests until resources are available.
common::Result< T > execute_with_circuit_breaker(circuit_breaker &cb, const std::string &name, Func &&func)
Execute an operation through a circuit breaker.
retry_config create_fixed_delay_config(size_t max_attempts=3, std::chrono::milliseconds delay=std::chrono::milliseconds(1000))
Factory function for fixed delay config.
retry_config create_fibonacci_backoff_config(size_t max_attempts=3, std::chrono::milliseconds initial_delay=std::chrono::milliseconds(1000))
Factory function for Fibonacci backoff config.
retry_config create_exponential_backoff_config(size_t max_attempts=3, std::chrono::milliseconds initial_delay=std::chrono::milliseconds(1000))
Factory function for exponential backoff config.
common::resilience::circuit_breaker circuit_breaker
circuit_breaker_registry & global_circuit_breaker_registry()
Get global circuit breaker registry.
fault_tolerance_registry & global_fault_tolerance_registry()
Get global fault tolerance manager registry.
common::resilience::circuit_breaker_config circuit_breaker_config
retry_executor_registry & global_retry_executor_registry()
Get global retry executor registry.
Retry strategies with backoff for monitoring operations.
Extended error information with context.
std::function< bool(const error_info &)> should_retry
bool validate() const
Validate configuration.
TEST_F(FaultToleranceTest, CircuitBreakerClosedState)