Skip to content

Commit 1ac6219

Browse files
committed
feature(sdcm/sct_events): Real-time Argus Events
This commit adds a new sct_events-related module that posts events to Argus as they arrive from the events device. It is based on the existing Grafana pipeline. To support earlier (and always-available) posting, the argus utils module was extended to provide a singleton Argus instance class, which is set up the first time the Argus client is initialized. Task: scylladb/argus#787
1 parent fa0428c commit 1ac6219

File tree

6 files changed

+197
-2
lines changed

6 files changed

+197
-2
lines changed

sdcm/sct_events/argus.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# This program is free software; you can redistribute it and/or modify
2+
# it under the terms of the GNU Affero General Public License as published by
3+
# the Free Software Foundation; either version 3 of the License, or
4+
# (at your option) any later version.
5+
#
6+
# This program is distributed in the hope that it will be useful,
7+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
8+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
9+
#
10+
# See LICENSE for more details.
11+
#
12+
# Copyright (c) 2025 ScyllaDB
13+
14+
import time
15+
import logging
16+
import threading
17+
from typing import NewType, Dict, Any, Tuple, Optional, Callable, cast
18+
from functools import partial
19+
from collections import defaultdict
20+
21+
from argus.common.sct_types import RawEventPayload
22+
from sdcm.sct_events.events_processes import \
23+
EVENTS_ARGUS_ANNOTATOR_ID, EVENTS_ARGUS_AGGREGATOR_ID, EVENTS_ARGUS_POSTMAN_ID, \
24+
EventsProcessesRegistry, BaseEventsProcess, EventsProcessPipe, \
25+
start_events_process, get_events_process, verbose_suppress
26+
from sdcm.utils.argus import Argus
27+
28+
29+
# Aggregator tuning knobs.
# Length of one de-duplication window, in seconds.
ARGUS_EVENT_AGGREGATOR_TIME_WINDOW: float = 90
# How many events sharing one key are forwarded within a single window.
ARGUS_EVENT_AGGREGATOR_MAX_DUPLICATES: int = 5
# Queue wait timeout, in seconds.
ARGUS_EVENT_AGGREGATOR_QUEUE_WAIT_TIMEOUT: float = 1

LOGGER = logging.getLogger(__name__)

# Payload handed from stage to stage of the pipeline.
SCTArgusEvent = NewType("SCTArgusEvent", RawEventPayload)
# Key under which the aggregator counts duplicates.
SCTArgusEventKey = NewType("SCTArgusEventKey", Tuple[str, ...])
38+
39+
40+
class ArgusEventCollector(EventsProcessPipe[Tuple[str, Any], SCTArgusEvent]):
    """First pipeline stage: turn published SCT events into Argus event payloads.

    Events whose ``publish_to_argus`` flag is false are skipped, and so is every
    event that arrives while no Argus client is available.
    """

    def run(self) -> None:
        """Consume ``(event_class, event)`` tuples and queue ``SCTArgusEvent`` payloads."""
        # ``Argus.get()`` returns ``None`` until ``Argus.init_global()`` has run.
        # Resolve the client lazily, inside the suppressed section, instead of
        # dereferencing it once at thread start: that would either raise
        # AttributeError outside ``verbose_suppress`` or pin a stale ``None``.
        client = None
        for event_tuple in self.inbound_events():
            with verbose_suppress("ArgusEventCollector failed to process %s", event_tuple):
                event_class, event = event_tuple  # try to unpack event from EventsDevice
                if not event.publish_to_argus:
                    continue
                if client is None:
                    argus = Argus.get()
                    client = argus.client if argus is not None else None
                if not client:
                    # No client yet: the event cannot be attributed to a run, drop it.
                    continue
                evt = SCTArgusEvent({
                    "run_id": client.run_id,
                    "severity": event.severity.name,
                    "ts": event.timestamp,
                    "duration": getattr(event, "duration", None),
                    "event_type": event_class,
                    "message": str(event),
                    "known_issue": getattr(event, "known_issue", None),
                    "nemesis_name": getattr(event, "nemesis_name", None),
                    "nemesis_status": getattr(event, "nemesis_status", None),
                    "node": getattr(event, "node", None),
                    "received_timestamp": getattr(event, "received_timestamp", None),
                    "target_node": getattr(event, "target_node", None),
                })
                self.outbound_queue.put(evt)
65+
66+
67+
class ArgusEventAggregator(EventsProcessPipe[SCTArgusEvent, SCTArgusEvent]):
    """Second pipeline stage: limit how many same-keyed events pass per time window.

    Within one window of ``time_window`` seconds, at most ``max_duplicates``
    events sharing a ``unique_key`` are forwarded; the rest are dropped.
    """

    inbound_events_process = EVENTS_ARGUS_ANNOTATOR_ID
    time_window = ARGUS_EVENT_AGGREGATOR_TIME_WINDOW
    max_duplicates = ARGUS_EVENT_AGGREGATOR_MAX_DUPLICATES

    def run(self) -> None:
        """Forward inbound events to the outbound queue, dropping excess duplicates."""
        seen_in_window: Dict[SCTArgusEventKey, int] = defaultdict(int)
        window_deadline = time.perf_counter()

        for event in self.inbound_events():
            with verbose_suppress("ArgusEventAggregator failed to process an event %s", event):
                key = self.unique_key(event)
                overdue = time.perf_counter() - window_deadline

                if overdue > 0:
                    # The current window is over: start counting from scratch.
                    seen_in_window.clear()

                    # Several whole windows may have elapsed since the last event,
                    # so advance the deadline past all of them at once.
                    windows_elapsed = overdue // self.time_window + 1
                    window_deadline += windows_elapsed * self.time_window

                seen_in_window[key] += 1
                if seen_in_window[key] <= self.max_duplicates:
                    # Still under the per-window limit: hand over for posting.
                    LOGGER.debug("Event moving to posting queue: %s", event)
                    self.outbound_queue.put(event)

    @staticmethod
    def unique_key(event: SCTArgusEvent) -> SCTArgusEventKey:
        """Return the (run_id, severity, event_type) key used to count duplicates."""
        return SCTArgusEventKey((event["run_id"], event["severity"], event["event_type"]))
99+
100+
101+
class ArgusEventPostman(BaseEventsProcess[SCTArgusEvent, None], threading.Thread):
    """Final pipeline stage: submit aggregated events to Argus.

    The thread blocks until ``start_posting_argus_events()`` (or ``terminate()``)
    sets the ``enabled`` flag.  Events consumed while no client has been attached
    through ``enable_argus_posting()`` are dropped.
    """

    inbound_events_process = EVENTS_ARGUS_AGGREGATOR_ID

    def __init__(self, _registry: EventsProcessesRegistry):
        self.enabled = threading.Event()
        self._argus_client = None
        super().__init__(_registry=_registry)

    def run(self) -> None:
        """Wait for the enable flag, then submit every inbound event via the client."""
        self.enabled.wait()

        for event in self.inbound_events():  # events from ArgusAggregator
            # Check the client BEFORE building the suppress context: the context's
            # arguments dereference the client, so with no client attached they
            # would raise AttributeError outside the suppressed section and kill
            # the thread.
            client = self._argus_client
            if not client:
                continue
            with verbose_suppress("ArgusEventPostman failed to post an event to '%s' "
                                  "endpoint.\nEvent: %s", client.Routes.SUBMIT_EVENT, event):
                client.submit_event(event)

    def enable_argus_posting(self) -> None:
        """Attach the client held by the global ``Argus`` singleton."""
        self._argus_client = Argus.get().client

    def start_posting_argus_events(self):
        """Release ``run()`` so that it starts consuming events."""
        self.enabled.set()

    def terminate(self) -> None:
        """Terminate the process and release a ``run()`` still blocked on the flag."""
        super().terminate()
        self.enabled.set()
127+
128+
129+
# Starters for the three pipeline stages, each bound to its registry ID and process class.
start_argus_event_collector = partial(
    start_events_process, EVENTS_ARGUS_ANNOTATOR_ID, ArgusEventCollector)
start_argus_aggregator = partial(
    start_events_process, EVENTS_ARGUS_AGGREGATOR_ID, ArgusEventAggregator)
start_argus_postman = partial(
    start_events_process, EVENTS_ARGUS_POSTMAN_ID, ArgusEventPostman)

# Lookup of the registered postman; the cast only narrows the static return type.
get_argus_postman = cast(
    Callable[..., ArgusEventPostman],
    partial(get_events_process, EVENTS_ARGUS_POSTMAN_ID),
)
133+
134+
135+
def start_argus_pipeline(_registry: Optional[EventsProcessesRegistry] = None) -> None:
    """Start the collector, aggregator and postman processes, in that order."""
    stage_starters = (
        start_argus_event_collector,
        start_argus_aggregator,
        start_argus_postman,
    )
    for start_stage in stage_starters:
        start_stage(_registry=_registry)
139+
140+
141+
def enable_argus_posting(_registry: Optional[EventsProcessesRegistry] = None) -> None:
    """Look up the Argus postman in the registry and attach the Argus client to it."""
    postman = get_argus_postman(_registry=_registry)
    postman.enable_argus_posting()
143+
144+
145+
def start_posting_argus_events(_registry: Optional[EventsProcessesRegistry] = None) -> None:
    """Look up the Argus postman in the registry and let it start posting events."""
    postman = get_argus_postman(_registry=_registry)
    postman.start_posting_argus_events()
147+
148+
149+
__all__ = ("start_argus_pipeline", "enable_argus_posting", "start_posting_argus_events")

sdcm/sct_events/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ class SctEvent:
9696

9797
_ready_to_publish: bool = False # set it to True in __init__() and to False in publish() to prevent double-publish
9898
publish_to_grafana: bool = True
99+
publish_to_argus: bool = True
99100
save_to_files: bool = True
100101

101102
def __init_subclass__(cls, abstract: bool = False):

sdcm/sct_events/events_processes.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
EVENTS_GRAFANA_ANNOTATOR_ID = "EVENTS_GRAFANA_ANNOTATOR"
3232
EVENTS_GRAFANA_AGGREGATOR_ID = "EVENTS_GRAFANA_AGGREGATOR"
3333
EVENTS_GRAFANA_POSTMAN_ID = "EVENTS_GRAFANA_POSTMAN"
34+
EVENTS_ARGUS_ANNOTATOR_ID = "EVENTS_ARGUS_ANNOTATOR"
35+
EVENTS_ARGUS_AGGREGATOR_ID = "EVENTS_ARGUS_AGGREGATOR"
36+
EVENTS_ARGUS_POSTMAN_ID = "EVENTS_ARGUS_POSTMAN"
3437
EVENTS_ANALYZER_ID = "EVENTS_ANALYZER"
3538
EVENTS_HANDLER_ID = "EVENTS_HANDLER"
3639
EVENTS_COUNTER_ID = "EVENTS_COUNTER_ID"

sdcm/sct_events/setup.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from sdcm.sct_config import SCTConfiguration
2121
from sdcm.sct_events import Severity
22+
from sdcm.sct_events.argus import start_argus_pipeline
2223
from sdcm.sct_events.event_handler import start_events_handler
2324
from sdcm.sct_events.grafana import start_grafana_pipeline
2425
from sdcm.sct_events.filters import DbEventsFilter, EventsSeverityChangerFilter
@@ -29,7 +30,7 @@
2930
from sdcm.sct_events.events_analyzer import start_events_analyzer
3031
from sdcm.sct_events.event_counter import start_events_counter
3132
from sdcm.sct_events.events_processes import \
32-
EVENTS_MAIN_DEVICE_ID, EVENTS_FILE_LOGGER_ID, EVENTS_ANALYZER_ID, \
33+
EVENTS_ARGUS_AGGREGATOR_ID, EVENTS_ARGUS_ANNOTATOR_ID, EVENTS_ARGUS_POSTMAN_ID, EVENTS_MAIN_DEVICE_ID, EVENTS_FILE_LOGGER_ID, EVENTS_ANALYZER_ID, \
3334
EVENTS_GRAFANA_ANNOTATOR_ID, EVENTS_GRAFANA_AGGREGATOR_ID, EVENTS_GRAFANA_POSTMAN_ID, \
3435
EventsProcessesRegistry, create_default_events_process_registry, get_events_process, EVENTS_HANDLER_ID, EVENTS_COUNTER_ID
3536
from sdcm.utils.issues import SkipPerIssues
@@ -55,6 +56,7 @@ def start_events_device(log_dir: Optional[Union[str, Path]] = None,
5556

5657
start_events_logger(_registry=_registry)
5758
start_grafana_pipeline(_registry=_registry)
59+
start_argus_pipeline(_registry=_registry)
5860
start_events_analyzer(_registry=_registry)
5961
start_events_handler(_registry=_registry)
6062
start_events_counter(_registry=_registry)
@@ -69,6 +71,9 @@ def stop_events_device(_registry: Optional[EventsProcessesRegistry] = None) -> N
6971
EVENTS_GRAFANA_ANNOTATOR_ID,
7072
EVENTS_GRAFANA_AGGREGATOR_ID,
7173
EVENTS_GRAFANA_POSTMAN_ID,
74+
EVENTS_ARGUS_ANNOTATOR_ID,
75+
EVENTS_ARGUS_AGGREGATOR_ID,
76+
EVENTS_ARGUS_POSTMAN_ID,
7277
EVENTS_COUNTER_ID,
7378
EVENTS_HANDLER_ID,
7479
EVENTS_ANALYZER_ID,

sdcm/test_config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from sdcm.keystore import KeyStore
1313
from sdcm.provision.common.configuration_script import ConfigurationScriptBuilder
1414
from sdcm.sct_events import Severity
15+
from sdcm.sct_events.argus import enable_argus_posting, start_posting_argus_events
1516
from sdcm.sct_events.system import TestFrameworkEvent
1617
from sdcm.utils.argus import ArgusError, get_argus_client
1718
from sdcm.utils.ci_tools import get_job_name
@@ -294,9 +295,15 @@ def init_argus_client(cls, params: dict, test_id: str | None = None):
294295
LOGGER.info("Initializing Argus connection...")
295296
try:
296297
cls._argus_client = get_argus_client(run_id=cls.test_id() if not test_id else test_id)
298+
enable_argus_posting()
299+
start_posting_argus_events()
297300
return
298301
except ArgusError as exc:
299302
LOGGER.warning("Failed to initialize argus client: %s", exc.message)
303+
except RuntimeError as exc:
304+
LOGGER.warning("Skipping setting up argus events: %s", exc)
305+
return
306+
300307
TestFrameworkEvent(
301308
source=cls.__name__,
302309
source_method='init_argus_client',

sdcm/utils/argus.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import re
33
import os
44
from pathlib import Path
5+
import threading
6+
from typing import Optional
57
from uuid import UUID
68

79
from argus.client.sct.client import ArgusSCTClient
@@ -15,6 +17,31 @@
1517
LOGGER = logging.getLogger(__name__)
1618

1719

20+
class Argus:
    """Process-wide holder for a single ``ArgusSCTClient``.

    ``init_global()`` publishes the wrapper once; later calls are ignored.
    ``get()`` returns the published wrapper, or ``None`` if nothing has been
    published yet, so callers must handle the ``None`` case.
    """

    # The published wrapper (an ``Argus``, not the raw client); None until init_global() runs.
    INSTANCE: Optional["Argus"] = None
    # Set once INSTANCE has been published.
    INIT_DONE = threading.Event()

    def __init__(self, client: "ArgusSCTClient"):
        self._client = client

    @classmethod
    def init_global(cls, client: "ArgusSCTClient") -> None:
        """Publish ``client`` as the global instance; a no-op after the first call."""
        if cls.INIT_DONE.is_set():
            return
        cls.INSTANCE = cls(client)
        cls.INIT_DONE.set()

    @classmethod
    def get(cls, init_default=False) -> Optional["Argus"]:
        """Return the global instance, or ``None`` when it has not been initialized.

        With ``init_default=True`` and no instance published yet, a client is first
        created for the run ID taken from the ``SCT_TEST_ID`` environment variable
        and published.
        """
        if init_default and not cls.INIT_DONE.is_set():
            # init_global=False: publishing is done explicitly right here.
            cls.init_global(get_argus_client(run_id=os.environ.get("SCT_TEST_ID"), init_global=False))
        return cls.INSTANCE

    @property
    def client(self) -> "ArgusSCTClient":
        """The wrapped Argus client."""
        return self._client
43+
44+
1845
class ArgusError(Exception):
1946

2047
def __init__(self, message: str, *args: list) -> None:
@@ -37,13 +64,16 @@ def is_uuid(uuid) -> bool:
3764
return False
3865

3966

40-
def get_argus_client(run_id: UUID | str) -> ArgusSCTClient:
67+
def get_argus_client(run_id: UUID | str, init_global=True) -> ArgusSCTClient:
    """Build an ``ArgusSCTClient`` for ``run_id`` from the key-store credentials.

    Raises ``ArgusError`` when ``run_id`` is not a valid UUID.  Unless
    ``init_global`` is false, the new client is also passed to
    ``Argus.init_global()``.
    """
    if not is_uuid(run_id):
        raise ArgusError("Malformed UUID provided")

    credentials = KeyStore().get_argus_rest_credentials()
    client = ArgusSCTClient(
        run_id=run_id,
        auth_token=credentials["token"],
        base_url=credentials["baseUrl"],
        extra_headers=credentials.get("extra_headers"),
    )

    if init_global:
        Argus.init_global(client)

    return client
4878

4979

0 commit comments

Comments
 (0)