feature(sdcm/sct_events): Real-time Argus Events #12095
base: master
Changes from all commits
`sdcm/sct_events/argus.py` (new file, @@ -0,0 +1,139 @@):

```python
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See LICENSE for more details.
#
# Copyright (c) 2025 ScyllaDB

import time
import logging
import threading
from typing import NewType, Dict, Any, Tuple, Optional, Callable, cast
from functools import partial
from collections import defaultdict

from argus.common.sct_types import RawEventPayload
from sdcm.sct_events.events_processes import \
    EVENTS_ARGUS_ANNOTATOR_ID, EVENTS_ARGUS_AGGREGATOR_ID, EVENTS_ARGUS_POSTMAN_ID, \
    EventsProcessesRegistry, BaseEventsProcess, EventsProcessPipe, \
    start_events_process, get_events_process, verbose_suppress
from sdcm.utils.argus import Argus


ARGUS_EVENT_AGGREGATOR_TIME_WINDOW: float = 90  # seconds
LOGGER = logging.getLogger(__name__)


SCTArgusEvent = NewType("SCTArgusEvent", RawEventPayload)
SCTArgusEventKey = NewType("SCTArgusEventKey", Tuple[str, ...])


class ArgusEventCollector(EventsProcessPipe[Tuple[str, Any], SCTArgusEvent]):
    def run(self) -> None:
        if Argus.get() and (client := Argus.get().client):
            run_id = client.run_id
        else:
            run_id = None
        for event_tuple in self.inbound_events():
            with verbose_suppress("ArgusEventCollector failed to process %s", event_tuple):
                event_class, event = event_tuple  # try to unpack event from EventsDevice
                if not event.publish_to_argus:
                    continue
                evt = SCTArgusEvent({
                    "run_id": run_id,
                    "severity": event.severity.name,
                    "ts": event.timestamp,
                    "duration": getattr(event, "duration", None),
                    "event_type": event_class,
                    "message": str(event),
                    "known_issue": getattr(event, "known_issue", None),
                    "nemesis_name": getattr(event, "nemesis_name", None),
                    "nemesis_status": getattr(event, "nemesis_status", None),
                    "node": getattr(event, "node", None),
                    "received_timestamp": getattr(event, "received_timestamp", None),
                    "target_node": getattr(event, "target_node", None),
                })
                self.outbound_queue.put(evt)


class ArgusEventAggregator(EventsProcessPipe[SCTArgusEvent, SCTArgusEvent]):
    inbound_events_process = EVENTS_ARGUS_ANNOTATOR_ID
    time_window = ARGUS_EVENT_AGGREGATOR_TIME_WINDOW

    def run(self) -> None:
        time_window_counters: Dict[SCTArgusEventKey, int] = defaultdict(int)

        for event in self.inbound_events():
            with verbose_suppress("ArgusEventAggregator failed to process an event %s", event):
                event_key = self.unique_key(event)
                if time.perf_counter() - time_window_counters.get(event_key, 0) > ARGUS_EVENT_AGGREGATOR_TIME_WINDOW:
                    # not seen event from sometime or ever
                    time_window_counters[event_key] = time.perf_counter()
                else:
                    # recently seen
                    continue

                # Put the event to the posting queue.
                LOGGER.debug("Event moving to posting queue: %s", event)
                self.outbound_queue.put(event)

    @staticmethod
    def unique_key(event: SCTArgusEvent) -> SCTArgusEventKey:
        return SCTArgusEventKey(tuple([event["run_id"], event["severity"], event["event_type"]]))


class ArgusEventPostman(BaseEventsProcess[SCTArgusEvent, None], threading.Thread):
    inbound_events_process = EVENTS_ARGUS_AGGREGATOR_ID

    def __init__(self, _registry: EventsProcessesRegistry):
        self.enabled = threading.Event()
        self._argus_client = None
        super().__init__(_registry=_registry)

    def run(self) -> None:
        self.enabled.wait()

        for event in self.inbound_events():  # events from ArgusAggregator
            with verbose_suppress("ArgusEventPostman failed to post an event to '%s' "
                                  "endpoint.\nEvent: %s", self._argus_client.Routes.SUBMIT_EVENT, event):
                if self._argus_client:
                    self._argus_client.submit_event(event)

    def enable_argus_posting(self) -> None:
        self._argus_client = Argus.get().client

    def start_posting_argus_events(self):
        self.enabled.set()

    def terminate(self) -> None:
        super().terminate()
        self.enabled.set()


start_argus_event_collector = partial(start_events_process, EVENTS_ARGUS_ANNOTATOR_ID, ArgusEventCollector)
start_argus_aggregator = partial(start_events_process, EVENTS_ARGUS_AGGREGATOR_ID, ArgusEventAggregator)
start_argus_postman = partial(start_events_process, EVENTS_ARGUS_POSTMAN_ID, ArgusEventPostman)
get_argus_postman = cast(Callable[..., ArgusEventPostman], partial(get_events_process, EVENTS_ARGUS_POSTMAN_ID))


def start_argus_pipeline(_registry: Optional[EventsProcessesRegistry] = None) -> None:
    start_argus_event_collector(_registry=_registry)
    start_argus_aggregator(_registry=_registry)
    start_argus_postman(_registry=_registry)


def enable_argus_posting(_registry: Optional[EventsProcessesRegistry] = None) -> None:
    get_argus_postman(_registry=_registry).enable_argus_posting()


def start_posting_argus_events(_registry: Optional[EventsProcessesRegistry] = None) -> None:
    get_argus_postman(_registry=_registry).start_posting_argus_events()


__all__ = ("start_argus_pipeline", "enable_argus_posting", "start_posting_argus_events")
```
A second file wires event posting into `init_argus_client`:

```python
@@ -12,6 +12,7 @@
from sdcm.keystore import KeyStore
from sdcm.provision.common.configuration_script import ConfigurationScriptBuilder
from sdcm.sct_events import Severity
from sdcm.sct_events.argus import enable_argus_posting, start_posting_argus_events
from sdcm.sct_events.system import TestFrameworkEvent
from sdcm.utils.argus import ArgusError, get_argus_client
from sdcm.utils.ci_tools import get_job_name

@@ -294,9 +295,15 @@ def init_argus_client(cls, params: dict, test_id: str | None = None):
    LOGGER.info("Initializing Argus connection...")
    try:
        cls._argus_client = get_argus_client(run_id=cls.test_id() if not test_id else test_id)
        enable_argus_posting()
```
Review thread on `enable_argus_posting()`:

- I think we should start it along with other devices in …
- We aren't - this is to release the lock on the poster, which would either be started or not already by the main events device; the reason this exception handler is there is to guard against situations where the events device isn't initialized, for example inside the …
- There are currently 3 steps that need to be taken to enable argus events: … The latter two can be combined into one, but the split here is warranted as …
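For orientation, a minimal sketch of how the three entry points exported by the new module fit together; the call sites and ordering shown here are an illustration, not code from this PR:

```python
# Illustrative wiring only (not from the PR). The three public helpers come
# from the new sdcm/sct_events/argus.py module shown above.
from sdcm.sct_events.argus import (
    start_argus_pipeline,         # starts the collector -> aggregator -> postman processes
    enable_argus_posting,         # hands the Argus client to the postman
    start_posting_argus_events,   # releases the postman's lock so it begins submitting
)

start_argus_pipeline()            # step 1: typically together with the other events devices
enable_argus_posting()            # step 2: once an Argus client has been initialized
start_posting_argus_events()      # step 3: from here on, events reach Argus in real time
```

In the PR, the last two calls appear in `init_argus_client` above; where the pipeline itself gets started is what the thread above discusses.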
The hunk continues:

```python
        start_posting_argus_events()
        return
    except ArgusError as exc:
        LOGGER.warning("Failed to initialize argus client: %s", exc.message)
    except RuntimeError as exc:
        LOGGER.warning("Skipping setting up argus events: %s", exc)
        return

    TestFrameworkEvent(
        source=cls.__name__,
        source_method='init_argus_client',
```
`sdcm/utils/argus.py`:

```python
@@ -2,6 +2,8 @@
import re
import os
from pathlib import Path
import threading
from typing import Optional
from uuid import UUID

from argus.client.sct.client import ArgusSCTClient

@@ -15,6 +17,31 @@
LOGGER = logging.getLogger(__name__)
```
Review thread on the new `Argus` wrapper class (added below):

- Why do we need that? Can we create the argus client when enabling posting in …
- This means having to instantiate …
- We have …
- Circular dependency as well here, as importing …
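To illustrate the pattern under discussion, a minimal sketch of how consumer code resolves the client through the wrapper; the snippet is illustrative, not from the PR:

```python
# Illustrative only: events-process code looks the client up lazily through the
# global Argus wrapper instead of importing get_argus_client directly, which
# lets it defer client creation to whatever code initialized the wrapper.
from sdcm.utils.argus import Argus

argus = Argus.get()          # None until Argus.init_global() has been called
run_id = argus.client.run_id if argus else None
```

`get_argus_client(init_global=True)` is what populates the wrapper, so code that merely consumes the client never has to construct one itself.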
The wrapper and the updated `get_argus_client`:

```python
class Argus:

    INSTANCE: Optional[ArgusSCTClient] = None
    INIT_DONE = threading.Event()

    def __init__(self, client: ArgusSCTClient):
        self._client = client

    @classmethod
    def init_global(cls, client: ArgusSCTClient):
        if cls.INIT_DONE.is_set():
            return
        cls.INSTANCE = cls(client)
        cls.INIT_DONE.set()

    @classmethod
    def get(cls, init_default=False) -> 'Argus':
        if init_default and not cls.INIT_DONE.is_set():
            cls.init_global(get_argus_client(run_id=os.environ.get("SCT_TEST_ID"), init_global=False))
        return cls.INSTANCE

    @property
    def client(self) -> ArgusSCTClient:
        return self._client


class ArgusError(Exception):

    def __init__(self, message: str, *args: list) -> None:

@@ -37,13 +64,16 @@ def is_uuid(uuid) -> bool:
    return False


# previously: def get_argus_client(run_id: UUID | str) -> ArgusSCTClient:
def get_argus_client(run_id: UUID | str, init_global=True) -> ArgusSCTClient:
    if not is_uuid(run_id):
        raise ArgusError("Malformed UUID provided")
    creds = KeyStore().get_argus_rest_credentials()
    argus_client = ArgusSCTClient(
        run_id=run_id, auth_token=creds["token"], base_url=creds["baseUrl"], extra_headers=creds.get("extra_headers"))

    if init_global:
        Argus.init_global(argus_client)

    return argus_client
```
Review thread on the aggregator's time-window de-duplication:
Generally, I dislike this algorithm: it delays events by time_window, and in case of a burst of 1000 events it will anyway send 200 of them (with 5 max duplicates set).
I think it would be better to fire off the event straight away and, for some time period, silence the other events with the same event_key.
Something like this (needs testing, e.g. with a unit test):
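A minimal sketch of that "send first, then silence" idea; the helper below and its names are illustrative, not the exact snippet from the review:

```python
# Illustrative sketch only: forward the first event for a key immediately,
# then drop further events with the same key until the window expires.
import time
from typing import Dict, Tuple

SILENCE_WINDOW: float = 90  # seconds, mirroring ARGUS_EVENT_AGGREGATOR_TIME_WINDOW


def should_forward(event_key: Tuple[str, ...],
                   last_sent: Dict[Tuple[str, ...], float]) -> bool:
    """Forward the first event for a key right away; silence repeats inside the window."""
    now = time.perf_counter()
    last = last_sent.get(event_key)
    if last is not None and now - last < SILENCE_WINDOW:
        return False  # an identical key was forwarded recently, drop this one
    last_sent[event_key] = now  # memory grows with the number of distinct keys
    return True
```

The aggregator's run() loop would then call should_forward() for each incoming event and put it on the outbound queue only when it returns True.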
This solution will bring some memory burden (proportional to the number of distinct event_keys), but even assuming 1M events it should fit in memory anyway.
@k0machi can you try this approach?
Reply: Hmm, sounds better, I'll try it (probably should also figure out a way to add argus events to unit_tests).
Reply: See how the other events devices are tested.