Monitoring System 0.1.0
System resource monitoring with pluggable collectors and alerting
health_reliability_example.cpp
// BSD 3-Clause License
// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
// See the LICENSE file in the project root for full license information.

#include <iostream>
#include <thread>
#include <random>
#include <atomic>
#include <chrono>
#include <cmath>    // std::pow, used for the retry backoff below
#include <memory>   // std::make_shared
#include <string>
#include <variant>  // std::visit over the circuit breaker stats

// monitoring_system headers (include directives elided in this listing):
// health_monitor, functional_health_check, circuit_breaker, error_boundary,
// retry_config, error_info, and monitoring_error_code.

using namespace kcenon::monitoring;
using namespace std::chrono_literals;

// Simulate a database connection
class DatabaseConnection {
private:
    std::atomic<bool> is_healthy_{true};
    std::atomic<int> query_count_{0};
    std::mt19937 rng_{std::random_device{}()};

public:
    void set_healthy(bool healthy) {
        is_healthy_ = healthy;
    }

    kcenon::common::Result<std::string> execute_query(const std::string& query) {
        query_count_++;

        // Simulate latency
        std::this_thread::sleep_for(10ms);

        // Simulate failures
        if (!is_healthy_) {
            return kcenon::common::Result<std::string>::err(
                error_info(monitoring_error_code::service_unavailable,
                           "Database connection lost").to_common_error());
        }

        // Random transient failures (10% chance)
        std::uniform_int_distribution<> dist(1, 10);
        if (dist(rng_) == 1) {
            return kcenon::common::Result<std::string>::err(
                error_info(monitoring_error_code::operation_timeout,
                           "Query timeout").to_common_error());
        }

        return kcenon::common::ok("Query result for: " + query);
    }

    int get_query_count() const { return query_count_; }
};

// Simulate an external API
class ExternalApiClient {
private:
    std::atomic<int> failure_count_{0};
    std::atomic<int> call_count_{0};

public:
    kcenon::common::Result<std::string> call_api(const std::string& endpoint) {
        call_count_++;

        // Simulate increasing failures
        if (failure_count_ > 5) {
            // API is down
            return kcenon::common::Result<std::string>::err(
                error_info(monitoring_error_code::service_unavailable,
                           "Service unavailable").to_common_error());
        }

        // Simulate intermittent failures
        if (call_count_ % 3 == 0) {
            failure_count_++;
            return kcenon::common::Result<std::string>::err(
                error_info(monitoring_error_code::operation_failed,
                           "Internal server error").to_common_error());
        }

        failure_count_ = 0; // Reset on success
        return kcenon::common::ok("API response from: " + endpoint);
    }

    void reset() {
        failure_count_ = 0;
        call_count_ = 0;
    }

    int get_call_count() const { return call_count_; }
};

// Demonstrate health monitoring
void demonstrate_health_monitoring() {
    std::cout << "\n=== Health Monitoring Demo ===" << std::endl;

    // Create health monitor
    health_monitor_config config;  // configuration struct for the health_monitor
    config.check_interval = 2s;    // interval between automatic check cycles
    config.cache_duration = 1s;    // how long check results are cached

    health_monitor monitor(config);

    // Create database connection for health checks
    auto database = std::make_shared<DatabaseConnection>();

    // Register liveness check
    monitor.register_check("database_liveness",
        std::make_shared<functional_health_check>(
            "database_liveness",
            health_check_type::liveness,
            [database]() -> health_check_result {
                // Simple ping check
                auto result = database->execute_query("SELECT 1");
                if (result.is_ok()) {
                    return health_check_result::healthy("Database is alive");
                } else {
                    return health_check_result::unhealthy(
                        "Database unreachable: " + result.error().message
                    );
                }
            },
            500ms, // timeout
            true   // critical
        )
    );

    // Register readiness check
    monitor.register_check("database_readiness",
        std::make_shared<functional_health_check>(
            "database_readiness",
            health_check_type::readiness,
            [database]() -> health_check_result {
                // Check if the database can handle queries
                auto result = database->execute_query("SELECT COUNT(*) FROM users");
                if (result.is_ok()) {
                    int query_count = database->get_query_count();
                    if (query_count > 100) {
                        return health_check_result::degraded(
                            "High query count: " + std::to_string(query_count)
                        );
                    }
                    return health_check_result::healthy("Database ready");
                } else {
                    return health_check_result::unhealthy(
                        "Database not ready: " + result.error().message
                    );
                }
            },
            1000ms, // timeout
            false   // non-critical
        )
    );

    // Register startup check
    monitor.register_check("system_startup",
        std::make_shared<functional_health_check>(
            "system_startup",
            health_check_type::startup,
            []() -> health_check_result {
                // Check system initialization
                static bool initialized = false;
                if (!initialized) {
                    std::this_thread::sleep_for(100ms); // Simulate initialization
                    initialized = true;
                }
                return health_check_result::healthy("System initialized");
            }
        )
    );
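
    // The three check types mirror common orchestration probes: liveness asks
    // whether the component is alive at all, readiness asks whether it can
    // serve traffic right now, and startup covers one-time initialization.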

    // Start health monitoring
    monitor.start();

    std::cout << "Health monitoring started" << std::endl;

    // Perform health checks
    std::cout << "\n1. Initial health check:" << std::endl;
    auto all_checks = monitor.check_all();
    for (const auto& [name, result] : all_checks) {
        std::cout << "  " << name << ": "
                  << (result.status == health_status::healthy ? "HEALTHY" :
                      result.status == health_status::degraded ? "DEGRADED" : "UNHEALTHY")
                  << " - " << result.message << std::endl;
    }

    // Get overall status
    auto overall = monitor.get_overall_status();
    std::cout << "  Overall status: "
              << (overall == health_status::healthy ? "HEALTHY" :
                  overall == health_status::degraded ? "DEGRADED" : "UNHEALTHY")
              << std::endl;

    // Simulate database failure
    std::cout << "\n2. Simulating database failure..." << std::endl;
    database->set_healthy(false);
    std::this_thread::sleep_for(1s);

    all_checks = monitor.check_all();
    for (const auto& [name, result] : all_checks) {
        if (name.find("database") != std::string::npos) {
            std::cout << "  " << name << ": "
                      << (result.status == health_status::healthy ? "HEALTHY" : "UNHEALTHY")
                      << " - " << result.message << std::endl;
        }
    }

    // Register recovery handler
    monitor.register_recovery_handler("database_liveness",
        [database]() -> bool {
            std::cout << "  Attempting database recovery..." << std::endl;
            database->set_healthy(true);
            return true;
        }
    );

    // Recover database
    std::cout << "\n3. Triggering recovery..." << std::endl;
    monitor.refresh();
    std::this_thread::sleep_for(2s);

    all_checks = monitor.check_all();
    std::cout << "  Database status after recovery: "
              << (all_checks["database_liveness"].status == health_status::healthy ?
                  "HEALTHY" : "UNHEALTHY") << std::endl;

    // Get health report
    std::cout << "\n4. Health Report:" << std::endl;
    std::cout << monitor.get_health_report() << std::endl;

    monitor.stop();
}

// Demonstrate circuit breaker
void demonstrate_circuit_breaker() {
    std::cout << "\n=== Circuit Breaker Demo ===" << std::endl;

    // Create external API client
    auto api_client = std::make_shared<ExternalApiClient>();

    // Configure circuit breaker
    circuit_breaker_config cb_config;
    cb_config.failure_threshold = 3;
    cb_config.timeout = 2s;
    cb_config.success_threshold = 2;
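    // Typical circuit-breaker life cycle implied by these fields (a summary,
    // not quoted from the library docs):
    //   CLOSED    -> OPEN      after failure_threshold failures
    //   OPEN      -> HALF_OPEN once the timeout elapses
    //   HALF_OPEN -> CLOSED    after success_threshold successes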

    circuit_breaker breaker(cb_config);

    std::cout << "Circuit breaker configured:" << std::endl;
    std::cout << "  Failure threshold: " << cb_config.failure_threshold << std::endl;
    std::cout << "  Reset timeout: 2s" << std::endl;

    // Define the operation
    auto api_operation = [api_client]() -> kcenon::common::Result<std::string> {
        return api_client->call_api("/users");
    };

    // Define fallback
    auto fallback = []() -> kcenon::common::Result<std::string> {
        return kcenon::common::ok(std::string("Cached response (fallback)"));
    };

    // Make calls through the circuit breaker
    std::cout << "\n1. Making API calls through circuit breaker:" << std::endl;

    for (int i = 1; i <= 10; ++i) {
        // Placeholder error value; every branch below overwrites it
        kcenon::common::Result<std::string> result =
            kcenon::common::make_error<std::string>(0, "");
        if (breaker.allow_request()) {
            result = api_operation();
            if (result.is_ok()) {
                breaker.record_success();
            } else {
                breaker.record_failure();
                result = fallback();
            }
        } else {
            result = fallback();
        }

        std::cout << "  Call " << i << ": ";
        if (result.is_ok()) {
            std::cout << "SUCCESS - " << result.value() << std::endl;
        } else {
            std::cout << "FAILED - " << result.error().message << std::endl;
        }

        // Check circuit state
        auto state = breaker.get_state();
        if (state == circuit_state::OPEN) {
            std::cout << "    [Circuit OPEN - using fallback]" << std::endl;
        } else if (state == circuit_state::HALF_OPEN) {
            std::cout << "    [Circuit HALF-OPEN - testing]" << std::endl;
        }

        std::this_thread::sleep_for(300ms);
    }

    // Get circuit breaker stats
    auto stats = breaker.get_stats();
    std::cout << "\n2. Circuit Breaker Stats:" << std::endl;
    for (const auto& [key, val] : stats) {
        // Stats values are variants, so print them through std::visit
        std::visit([&key](const auto& v) {
            std::cout << "  " << key << ": " << v << std::endl;
        }, val);
    }

    // Wait for the circuit to reset
    std::cout << "\n3. Waiting for circuit reset..." << std::endl;
    api_client->reset(); // Reset API client
    std::this_thread::sleep_for(3s);

    // Try again after reset
    std::cout << "\n4. Trying after reset:" << std::endl;
    for (int i = 1; i <= 3; ++i) {
        auto result = execute_with_circuit_breaker<std::string>(
            breaker, "api_breaker", api_operation);
        std::cout << "  Call " << i << ": ";
        if (result.is_ok()) {
            std::cout << "SUCCESS" << std::endl;
        } else {
            std::cout << "FAILED" << std::endl;
        }
    }
}

// Demonstrate retry policy (simplified)
void demonstrate_retry_policy() {
    std::cout << "\n=== Retry Policy Demo ===" << std::endl;

    // Configure retry policy
    retry_config config;
    config.max_attempts = 3;
    config.strategy = retry_strategy::exponential_backoff;
    config.initial_delay = 100ms;
    config.max_delay = 2s;
    config.backoff_multiplier = 2.0;

    std::cout << "Retry policy configured:" << std::endl;
    std::cout << "  Max attempts: " << config.max_attempts << std::endl;
    std::cout << "  Strategy: exponential backoff" << std::endl;
    std::cout << "  Initial delay: 100ms" << std::endl;

    // Manual retry loop (the retry_policy class is not available yet)
    std::cout << "\n1. Executing flaky operation with manual retry:" << std::endl;

    std::atomic<int> attempt_count{0};
    auto flaky_operation = [&attempt_count]() -> kcenon::common::Result<std::string> {
        attempt_count++;
        std::cout << "  Attempt " << attempt_count << "..." << std::endl;

        // Fail the first 2 attempts
        if (attempt_count <= 2) {
            return kcenon::common::Result<std::string>::err(
                error_info(monitoring_error_code::operation_timeout,
                           "Operation timed out").to_common_error());
        }

        return kcenon::common::ok(std::string("Operation succeeded!"));
    };

    // Seed with an error so the loop below always overwrites it
    kcenon::common::Result<std::string> final_result =
        kcenon::common::Result<std::string>::err(
            error_info(monitoring_error_code::operation_failed,
                       "Initialization pending").to_common_error());
    for (int i = 0; i < static_cast<int>(config.max_attempts); ++i) {
        final_result = flaky_operation();
        if (final_result.is_ok()) {
            break;
        }

        // Wait before retrying: initial_delay * multiplier^i, capped at max_delay
        if (i < static_cast<int>(config.max_attempts) - 1) {
            auto delay = config.initial_delay *
                         static_cast<int>(std::pow(config.backoff_multiplier, i));
            if (delay > config.max_delay) {
                delay = config.max_delay;
            }
            std::this_thread::sleep_for(delay);
        }
    }

    if (final_result.is_ok()) {
        std::cout << "  Final result: SUCCESS - " << final_result.value() << std::endl;
    } else {
        std::cout << "  Final result: FAILED - " << final_result.error().message << std::endl;
    }

    std::cout << "  Total attempts: " << attempt_count << std::endl;
}
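
// A minimal sketch (not part of monitoring_system) of how the manual loop
// above could be folded into a reusable helper. retry_with_backoff is a
// hypothetical name; it assumes only the retry_config fields already used in
// this example (max_attempts, initial_delay, backoff_multiplier, max_delay).
template <typename Func>
auto retry_with_backoff(const retry_config& cfg, Func&& op) -> decltype(op()) {
    auto result = op();
    for (std::size_t attempt = 1;
         attempt < static_cast<std::size_t>(cfg.max_attempts) && !result.is_ok();
         ++attempt) {
        // Exponential backoff: initial_delay * multiplier^(attempt-1), capped at max_delay
        auto delay = cfg.initial_delay *
                     static_cast<int>(std::pow(cfg.backoff_multiplier, attempt - 1));
        if (delay > cfg.max_delay) {
            delay = cfg.max_delay;
        }
        std::this_thread::sleep_for(delay);
        result = op();
    }
    return result;
}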

// Demonstrate error boundaries
void demonstrate_error_boundaries() {
    std::cout << "\n=== Error Boundaries Demo ===" << std::endl;

    // Configure error boundary
    error_boundary_config config;
    config.error_threshold = 5;  // maximum errors tolerated within the window
    config.error_window = 60s;
    config.enable_fallback_logging = true;

    error_boundary<std::string> boundary("critical_section", config);

    // Set error handler
    boundary.set_error_handler([](const error_info& error, degradation_level level) {
        std::cout << "  Error handler called: " << error.message
                  << " (degradation level: " << static_cast<int>(level) << ")" << std::endl;
    });

    std::cout << "Error boundary configured:" << std::endl;
    std::cout << "  Max errors: " << config.error_threshold << std::endl;
    std::cout << "  Error window: 60s" << std::endl;

    // Execute operations within the boundary
    std::cout << "\n1. Executing operations within error boundary:" << std::endl;

    for (int i = 1; i <= 7; ++i) {
        auto result = boundary.execute([i]() -> ::kcenon::common::Result<std::string> {
            std::cout << "  Operation " << i << ": ";

            // Simulate failures on odd numbers
            if (i % 2 == 1) {
                std::cout << "FAILED" << std::endl;
                error_info err(monitoring_error_code::operation_failed,
                               "Operation " + std::to_string(i) + " failed");
                return ::kcenon::common::Result<std::string>::err(err.to_common_error());
            }

            std::cout << "SUCCESS" << std::endl;
            return kcenon::common::ok("Result " + std::to_string(i));
        });

        // The boundary reports circuit_breaker_open once too many errors accumulate
        if (result.is_err() &&
            result.error().code == static_cast<int>(monitoring_error_code::circuit_breaker_open)) {
            std::cout << "  [Error boundary triggered - too many errors]" << std::endl;
            break;
        }
    }

    // Get statistics
    auto stats = boundary.get_metrics();
    std::cout << "\n2. Error Boundary Statistics:" << std::endl;
    std::cout << "  Total operations: " << stats.total_operations << std::endl;
    std::cout << "  Failed operations: " << stats.failed_operations << std::endl;
    std::cout << "  Success rate: "
              << (stats.total_operations > 0 ?
                  100.0 * (stats.total_operations - stats.failed_operations) / stats.total_operations : 0)
              << "%" << std::endl;
}

int main() {
    std::cout << "=== Health Monitoring & Reliability Example ===" << std::endl;

    try {
        // Part 1: Health Monitoring
        demonstrate_health_monitoring();

        // Part 2: Circuit Breaker
        demonstrate_circuit_breaker();

        // Part 3: Retry Policy
        demonstrate_retry_policy();

        // Part 4: Error Boundaries
        demonstrate_error_boundaries();

    } catch (const std::exception& e) {
        std::cerr << "Exception: " << e.what() << std::endl;
        return 1;
    }

    std::cout << "\n=== Example completed successfully ===" << std::endl;

    return 0;
}
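
The example leans on the common_system Result pattern throughout. A condensed
view of the idiom, drawn only from the calls made in this file (a usage
summary, not the library's full API):

    // Construct success and error values
    kcenon::common::Result<std::string> good = kcenon::common::ok(std::string("value"));
    kcenon::common::Result<std::string> bad = kcenon::common::Result<std::string>::err(
        error_info(monitoring_error_code::operation_failed, "reason").to_common_error());

    // Inspect before use: value() on the success path, error() on the failure path
    if (good.is_ok()) { std::string v = good.value(); }
    if (bad.is_err()) { std::string msg = bad.error().message; }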
Referenced declarations (from the monitoring_system and common_system headers)

Health monitoring - health monitor with dependency management, auto-recovery,
and statistics:
    common::VoidResult start()
        Start the periodic health monitoring background thread.
    common::VoidResult stop()
        Stop the periodic health monitoring background thread.
    common::Result<bool> register_check(const std::string& name, std::shared_ptr<health_check> check)
        Register a named health check.
    void register_recovery_handler(const std::string& check_name, std::function<bool()> handler)
        Register a recovery handler for a named health check.
    std::unordered_map<std::string, health_check_result> check_all()
        Execute all registered health checks.
    health_status get_overall_status()
        Get the aggregate health status across all cached results.
    void refresh()
        Manually refresh all health checks and trigger recovery if needed.
    std::string get_health_report()
        Generate a human-readable health report.
    std::chrono::milliseconds check_interval
        Interval between automatic health check cycles.
    std::chrono::seconds cache_duration
        Duration to cache health check results.

health_check_result - result of a health check operation:
    static health_check_result healthy(const std::string& msg = "OK")
    static health_check_result degraded(const std::string& msg)
    static health_check_result unhealthy(const std::string& msg)

Circuit breaker - circuit breaker integration for monitoring_system:
    common::resilience::circuit_breaker circuit_breaker
    common::resilience::circuit_breaker_config circuit_breaker_config
    common::Result<T> execute_with_circuit_breaker(circuit_breaker& cb, const std::string& name, Func&& func)
        Execute an operation through a circuit breaker.
    Fault tolerance manager coordinating circuit breakers and retries.

Error boundary - error boundary implementation for resilient operations, with
degradation levels for fault isolation:
    void set_error_handler(std::function<void(const error_info&, degradation_level)> handler)
        Set error handler callback.
    auto execute(Func&& func) -> common::Result<T>
        Execute a function within the error boundary.
    error_boundary_metrics get_metrics() const
        Get metrics.
    degradation_level
        Degradation levels for error boundary.
    Error boundary configuration.

Retry - retry strategies with backoff for monitoring operations:
    std::chrono::milliseconds initial_delay
    std::chrono::milliseconds max_delay
    delay (enum value)
        Delay requests until resources are available.

Errors and results:
    monitoring_error_code - monitoring system specific error codes.
    error_info - extended error information with context.
    common::error_info to_common_error() const
        Convert to common_system error_info.
    Result pattern type definitions for monitoring system.
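
A minimal usage sketch for execute_with_circuit_breaker, assuming only the
signature documented above; the helper presumably wraps the allow_request /
record_success / record_failure sequence that the demo performs by hand:

    // Hypothetical wrapper around the example's API client (illustrative only)
    kcenon::common::Result<std::string> fetch_users(circuit_breaker& breaker,
                                                    ExternalApiClient& client) {
        return execute_with_circuit_breaker<std::string>(
            breaker, "api_breaker",
            [&client]() { return client.call_api("/users"); });
    }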