Monitoring System 0.1.0
System resource monitoring with pluggable collectors and alerting
Loading...
Searching...
No Matches
alert_manager.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
5#pragma once
6
15#include <atomic>
16#include <condition_variable>
17#include <functional>
18#include <memory>
19#include <mutex>
20#include <string>
21#include <thread>
22#include <unordered_map>
23#include <vector>
24
25#include "alert_rule.h"
26#include "alert_types.h"
29
30namespace kcenon::monitoring {
31
32// Forward declarations
33class alert_notifier;
34
// Fields of the alert manager configuration. The brace initializer on each
// field is its default; the duration fields are expressed in milliseconds.
40 std::chrono::milliseconds default_evaluation_interval{15000};  ///< Default evaluation interval.
41 std::chrono::milliseconds default_repeat_interval{300000};  ///< Default repeat interval.
42 size_t max_alerts_per_rule{100};  ///< Maximum alerts per rule.
43 size_t max_silences{1000};  ///< Maximum active silences.
44 bool enable_grouping{true};  ///< Enable alert grouping.
45 std::chrono::milliseconds group_wait{30000};  ///< Wait time before a group is sent.
46 std::chrono::milliseconds group_interval{300000};  ///< Group batch interval.
47 std::chrono::milliseconds resolve_timeout{300000};  ///< Auto-resolve timeout.
48
53 bool validate() const {
54 return default_evaluation_interval.count() > 0 &&
55 default_repeat_interval.count() > 0 &&
57 max_silences > 0;
58 }
59};
60
// Counters for alert manager operations. Each one is an atomic unsigned
// 64-bit value that starts at zero.
// NOTE(review): the points at which each counter is incremented are not
// visible in this header — confirm against the implementation.
66 std::atomic<uint64_t> rules_evaluated{0};
67 std::atomic<uint64_t> alerts_created{0};
68 std::atomic<uint64_t> alerts_resolved{0};
69 std::atomic<uint64_t> alerts_suppressed{0};
70 std::atomic<uint64_t> notifications_sent{0};
71 std::atomic<uint64_t> notifications_failed{0};
72
74
82};
83
123public:
/// Signature of the metric provider callable: takes a string by const
/// reference and returns an optional double.
/// NOTE(review): presumably the string is a metric name and an empty optional
/// means "no value available" — confirm against set_metric_provider().
124 using metric_provider_func = std::function<std::optional<double>(const std::string&)>;
125
130
136
141
142 // Non-copyable, non-movable: copy construction is disabled here.
// The member index of this listing also shows the move constructor and both
// assignment operators declared as deleted.
143 alert_manager(const alert_manager&) = delete;
147
148 // ========== Lifecycle Management ==========
149
/// Start the alert manager.
/// @return common::VoidResult reporting success or failure.
154 common::VoidResult start();
155
/// Stop the alert manager.
/// @return common::VoidResult reporting success or failure.
160 common::VoidResult stop();
161
/// Check if manager is running.
166 bool is_running() const;
167
168 // ========== Rule Management ==========
169
/// Add an alert rule.
/// @param rule Shared pointer to the rule to add.
/// @return common::VoidResult reporting success or failure.
175 common::VoidResult add_rule(std::shared_ptr<alert_rule> rule);
176
/// Remove an alert rule.
/// @param rule_name Name identifying the rule to remove.
/// @return common::VoidResult reporting success or failure.
182 common::VoidResult remove_rule(const std::string& rule_name);
183
/// Get a rule by name.
/// @param rule_name Name identifying the rule.
/// @return Shared pointer to the rule.
/// NOTE(review): presumably a null pointer when no rule matches — confirm.
189 std::shared_ptr<alert_rule> get_rule(const std::string& rule_name) const;
190
/// Get all rules.
/// @return Vector of shared pointers to the rules, returned by value.
195 std::vector<std::shared_ptr<alert_rule>> get_rules() const;
196
/// Add a rule group.
/// @param group Shared pointer to the rule group to add.
/// @return common::VoidResult reporting success or failure.
202 common::VoidResult add_rule_group(std::shared_ptr<alert_rule_group> group);
203
204 // ========== Alert Operations ==========
205
/// Process a metric value.
/// @param metric_name Name of the metric.
/// @param value Value of the metric.
/// @return common::VoidResult reporting success or failure.
212 common::VoidResult process_metric(const std::string& metric_name, double value);
213
/// Process a batch of metrics.
/// @param metrics Map from string key to double value.
/// NOTE(review): keys are presumably metric names, mirroring process_metric() — confirm.
/// @return common::VoidResult reporting success or failure.
219 common::VoidResult process_metrics(const std::unordered_map<std::string, double>& metrics);
220
/// Get all active alerts.
/// @return Vector of alerts, returned by value.
225 std::vector<alert> get_active_alerts() const;
226
/// Get alert by fingerprint.
/// @param fingerprint Fingerprint string identifying the alert.
/// @return Optional alert.
/// NOTE(review): presumably empty when no alert has this fingerprint — confirm.
232 std::optional<alert> get_alert(const std::string& fingerprint) const;
233
/// Resolve an alert manually.
/// @param fingerprint Fingerprint string identifying the alert.
/// @return common::VoidResult reporting success or failure.
239 common::VoidResult resolve_alert(const std::string& fingerprint);
240
241 // ========== Silence Management ==========
242
/// Create a silence.
/// @param silence Silence configuration to register.
/// @return common::Result carrying a uint64_t on success.
/// NOTE(review): the uint64_t is presumably the id accepted by delete_silence() — confirm.
248 common::Result<uint64_t> create_silence(const alert_silence& silence);
249
/// Delete a silence.
/// @param silence_id Identifier of the silence to delete.
/// @return common::VoidResult reporting success or failure.
255 common::VoidResult delete_silence(uint64_t silence_id);
256
/// Get all active silences.
/// @return Vector of silences, returned by value.
261 std::vector<alert_silence> get_silences() const;
262
/// Check if an alert is silenced.
/// @param a Alert to check.
268 bool is_silenced(const alert& a) const;
269
270 // ========== Notifier Management ==========
271
/// Add a notifier.
/// @param notifier Shared pointer to the notifier to add.
/// @return common::VoidResult reporting success or failure.
277 common::VoidResult add_notifier(std::shared_ptr<alert_notifier> notifier);
278
/// Remove a notifier.
/// @param notifier_name Name identifying the notifier to remove.
/// @return common::VoidResult reporting success or failure.
284 common::VoidResult remove_notifier(const std::string& notifier_name);
285
/// Get all notifiers.
/// @return Vector of shared pointers to the notifiers, returned by value.
290 std::vector<std::shared_ptr<alert_notifier>> get_notifiers() const;
291
292 // ========== Metric Provider ==========
293
299
300 // ========== Event Bus Integration ==========
301
/// Set event bus for publishing alert events.
/// @param event_bus Shared pointer to the event bus.
306 void set_event_bus(std::shared_ptr<interface_event_bus> event_bus);
307
308 // ========== Metrics ==========
309
315
321
322private:
327
/// Evaluate a single rule.
/// @param rule Rule to evaluate.
/// @param value Value passed to the evaluation.
333 void evaluate_rule(const std::shared_ptr<alert_rule>& rule, double value);
334
/// Update alert state.
/// @param fingerprint Fingerprint string identifying the alert.
/// @param condition_met Boolean condition flag for the alert.
/// @param value Value associated with the update.
/// @param rule Rule associated with the alert.
341 void update_alert_state(const std::string& fingerprint,
342 bool condition_met,
343 double value,
344 const std::shared_ptr<alert_rule>& rule);
345
/// Send notifications for an alert.
/// @param a Alert to notify about.
350 void send_notifications(const alert& a);
351
356
361
362 // Configuration
364
365 // Rules: string-keyed rule map plus the list of rule groups.
// NOTE(review): the key is presumably the rule name used by get_rule() and
// rules_mutex_ presumably guards both containers — confirm in the implementation.
// The mutex is mutable so that const member functions are able to lock it.
366 mutable std::mutex rules_mutex_;
367 std::unordered_map<std::string, std::shared_ptr<alert_rule>> rules_;
368 std::vector<std::shared_ptr<alert_rule_group>> rule_groups_;
369
370 // Alerts: string-keyed alert map (presumably keyed by fingerprint — confirm).
371 mutable std::mutex alerts_mutex_;
372 std::unordered_map<std::string, alert> alerts_;
373
374 // Silences: map from a uint64_t key (presumably the silence id — confirm) to silence.
375 mutable std::mutex silences_mutex_;
376 std::unordered_map<uint64_t, alert_silence> silences_;
377
378 // Notifiers: registered notification handlers.
379 mutable std::mutex notifiers_mutex_;
380 std::vector<std::shared_ptr<alert_notifier>> notifiers_;
381
382 // Metric provider (this mutex, unlike the ones above, is not declared mutable).
383 std::mutex provider_mutex_;
385
386 // Event bus: shared handle installed through set_event_bus().
387 std::shared_ptr<interface_event_bus> event_bus_;
388
389 // Metrics
391
392 // Evaluation thread: atomic running flag (initially false) plus a
// condition variable and the mutex declared alongside it.
393 std::atomic<bool> running_{false};
395 std::condition_variable cv_;
396 std::mutex cv_mutex_;
397
398 // Last notification times: steady-clock time points keyed by string.
// NOTE(review): no dedicated mutex is declared next to this map — confirm
// which lock protects it.
399 std::unordered_map<std::string, std::chrono::steady_clock::time_point> last_notification_times_;
400};
401
// Base class for alert notification handlers: every operation below is pure virtual.
410public:
/// Virtual destructor, defaulted.
411 virtual ~alert_notifier() = default;
412
/// Get notifier name.
417 virtual std::string name() const = 0;
418
/// Send a notification for an alert.
/// @param a Alert to notify about.
/// @return common::VoidResult reporting success or failure.
424 virtual common::VoidResult notify(const alert& a) = 0;
425
/// Send a notification for an alert group.
/// @param group Group of alerts to notify about.
/// @return common::VoidResult reporting success or failure.
431 virtual common::VoidResult notify_group(const alert_group& group) = 0;
432
/// Check if notifier is ready.
437 virtual bool is_ready() const = 0;
438};
439
// Simple notifier that logs alerts.
447public:
/// Construct log notifier.
/// @param notifier_name Name reported by name(); defaults to "log_notifier".
452 explicit log_notifier(std::string notifier_name = "log_notifier")
453 : name_(std::move(notifier_name)) {}
454
/// Get the name of this log notifier (returns a copy of the stored name).
459 std::string name() const override { return name_; }
460
/// Log an alert notification.
/// @param a Alert to log.
/// @return common::VoidResult reporting success or failure.
466 common::VoidResult notify(const alert& a) override;
467
/// Log a grouped alert notification.
/// @param group Alert group to log.
/// @return common::VoidResult reporting success or failure.
473 common::VoidResult notify_group(const alert_group& group) override;
474
/// Check if the log notifier is ready; this implementation always returns true.
479 bool is_ready() const override { return true; }
480
481private:
// Name supplied at construction and returned by name().
482 std::string name_;
483};
484
// Notifier that invokes a callback function.
492public:
/// Callable invoked with a single alert.
493 using callback_func = std::function<void(const alert&)>;
/// Callable invoked with a whole alert group.
494 using group_callback_func = std::function<void(const alert_group&)>;
495
/// Construct callback notifier.
/// @param notifier_name Name reported by name().
/// @param callback Callable used for single-alert notifications.
/// @param group_callback Optional callable for group notifications; defaults
///        to nullptr, in which case notify_group() falls back to calling
///        notify() for each alert in the group.
502 callback_notifier(std::string notifier_name,
503 callback_func callback,
504 group_callback_func group_callback = nullptr)
505 : name_(std::move(notifier_name))
506 , callback_(std::move(callback))
507 , group_callback_(std::move(group_callback)) {}
508
/// Get the name of this callback notifier (returns a copy of the stored name).
513 std::string name() const override { return name_; }
514
515 common::VoidResult notify(const alert& a) override {
516 if (callback_) {
517 callback_(a);
518 return common::ok();
519 }
520 return common::VoidResult::err(error_info(monitoring_error_code::operation_failed, "No callback configured").to_common_error());
521 }
522
523 common::VoidResult notify_group(const alert_group& group) override {
524 if (group_callback_) {
525 group_callback_(group);
526 return common::ok();
527 }
528 // Fall back to individual notifications
529 for (const auto& a : group.alerts) {
530 auto result = notify(a);
531 if (!result.is_ok()) {
532 return result;
533 }
534 }
535 return common::ok();
536 }
537
/// Check if the callback notifier is ready: true when a single-alert
/// callback is set. The group callback is not consulted.
542 bool is_ready() const override { return callback_ != nullptr; }
543
544private:
// Name supplied at construction and returned by name().
545 std::string name_;
548};
549
550} // namespace kcenon::monitoring
Alert rule configuration and evaluation.
Core alert data structures for the monitoring system.
Central coordinator for the alert pipeline.
common::VoidResult remove_notifier(const std::string &notifier_name)
Remove a notifier.
std::vector< std::shared_ptr< alert_notifier > > notifiers_
void set_event_bus(std::shared_ptr< interface_event_bus > event_bus)
Set event bus for publishing alert events.
bool is_running() const
Check if manager is running.
std::unordered_map< uint64_t, alert_silence > silences_
common::VoidResult remove_rule(const std::string &rule_name)
Remove an alert rule.
common::VoidResult stop()
Stop the alert manager.
std::optional< alert > get_alert(const std::string &fingerprint) const
Get alert by fingerprint.
common::VoidResult add_rule(std::shared_ptr< alert_rule > rule)
Add an alert rule.
common::VoidResult delete_silence(uint64_t silence_id)
Delete a silence.
const alert_manager_config & config() const
Get configuration.
common::VoidResult process_metric(const std::string &metric_name, double value)
Process a metric value.
std::shared_ptr< interface_event_bus > event_bus_
alert_manager(alert_manager &&)=delete
common::VoidResult add_rule_group(std::shared_ptr< alert_rule_group > group)
Add a rule group.
common::VoidResult start()
Start the alert manager.
common::Result< uint64_t > create_silence(const alert_silence &silence)
Create a silence.
std::unordered_map< std::string, std::chrono::steady_clock::time_point > last_notification_times_
std::vector< alert > get_active_alerts() const
Get all active alerts.
void update_alert_state(const std::string &fingerprint, bool condition_met, double value, const std::shared_ptr< alert_rule > &rule)
Update alert state.
alert_manager()
Default constructor.
void send_notifications(const alert &a)
Send notifications for an alert.
std::unordered_map< std::string, std::shared_ptr< alert_rule > > rules_
std::condition_variable cv_
std::unordered_map< std::string, alert > alerts_
std::function< std::optional< double >(const std::string &)> metric_provider_func
metric_provider_func metric_provider_
alert_manager_metrics get_metrics() const
Get manager metrics.
common::VoidResult process_metrics(const std::unordered_map< std::string, double > &metrics)
Process a batch of metrics.
std::vector< std::shared_ptr< alert_notifier > > get_notifiers() const
Get all notifiers.
void cleanup_resolved_alerts()
Clean up resolved alerts.
alert_manager(const alert_manager &)=delete
void set_metric_provider(metric_provider_func provider)
Set the metric provider function.
std::shared_ptr< alert_rule > get_rule(const std::string &rule_name) const
Get a rule by name.
alert_manager & operator=(alert_manager &&)=delete
common::VoidResult resolve_alert(const std::string &fingerprint)
Resolve an alert manually.
void cleanup_silences()
Clean up expired silences.
alert_manager & operator=(const alert_manager &)=delete
void evaluate_rule(const std::shared_ptr< alert_rule > &rule, double value)
Evaluate a single rule.
common::VoidResult add_notifier(std::shared_ptr< alert_notifier > notifier)
Add a notifier.
std::vector< std::shared_ptr< alert_rule_group > > rule_groups_
std::vector< std::shared_ptr< alert_rule > > get_rules() const
Get all rules.
void evaluation_loop()
Main evaluation loop.
alert_manager(const alert_manager_config &config)
Construct with configuration.
bool is_silenced(const alert &a) const
Check if an alert is silenced.
std::vector< alert_silence > get_silences() const
Get all active silences.
Base class for alert notification handlers.
virtual std::string name() const =0
Get notifier name.
virtual common::VoidResult notify(const alert &a)=0
Send a notification for an alert.
virtual bool is_ready() const =0
Check if notifier is ready.
virtual common::VoidResult notify_group(const alert_group &group)=0
Send a notification for an alert group.
Notifier that invokes a callback function.
bool is_ready() const override
Check if the callback notifier is ready.
callback_notifier(std::string notifier_name, callback_func callback, group_callback_func group_callback=nullptr)
Construct callback notifier.
common::VoidResult notify(const alert &a) override
Send a notification for an alert.
std::string name() const override
Get the name of this callback notifier.
std::function< void(const alert_group &)> group_callback_func
std::function< void(const alert &)> callback_func
common::VoidResult notify_group(const alert_group &group) override
Send a notification for an alert group.
Thread-safe event bus implementation.
Definition event_bus.h:146
Simple notifier that logs alerts.
bool is_ready() const override
Check if the log notifier is ready.
common::VoidResult notify(const alert &a) override
Log an alert notification.
log_notifier(std::string notifier_name="log_notifier")
Construct log notifier.
std::string name() const override
Get the name of this log notifier.
common::VoidResult notify_group(const alert_group &group) override
Log a grouped alert notification.
Event bus interface for decoupled component communication.
Result pattern type definitions for monitoring system.
Group of related alerts for batch notification.
std::vector< alert > alerts
Alerts in this group.
Configuration for the alert manager.
std::chrono::milliseconds resolve_timeout
Auto-resolve timeout.
std::chrono::milliseconds default_evaluation_interval
Default eval interval.
size_t max_alerts_per_rule
Max alerts per rule.
bool validate() const
Validate configuration.
std::chrono::milliseconds group_wait
Wait time before group send.
bool enable_grouping
Enable alert grouping.
size_t max_silences
Max active silences.
std::chrono::milliseconds default_repeat_interval
Default repeat interval.
std::chrono::milliseconds group_interval
Group batch interval.
Metrics for alert manager operations.
std::atomic< uint64_t > alerts_suppressed
std::atomic< uint64_t > notifications_sent
alert_manager_metrics(const alert_manager_metrics &other)
std::atomic< uint64_t > notifications_failed
Silence configuration to suppress alerts.
Core alert data structure.
Extended error information with context.