    "MSG_LENGTH": np.uint64,
}

+ CSV_AGGREGATE_TYPES = {
+     "START_TIME": np.uint64,
+     "END_TIME": np.uint64,
+     "PACKETS": np.uint64,
+     "BYTES": np.uint64,
+     "FLOWS": np.uint64,
+ }
+
STATS_CSV_COLUMN_TYPES = {
    "Time": np.uint64,
    "UID": np.uint64,
@@ -94,36 +102,42 @@ class FlowStartEvent(Event):
    packet_rate: float
    time = 0
    flow_rate: float
+     flows: int

-     def __init__(self, data_rate, packet_rate, start_time, flow_rate):
+     def __init__(self, data_rate, packet_rate, start_time, flow_rate, flows):
        self.data_rate = data_rate
        self.packet_rate = packet_rate
        self.time = start_time
        self.flow_rate = flow_rate
+         self.flows = flows


class FlowEndEvent(Event):
    data_rate: float
    packet_rate: float
    time = 0
    flow_rate: float
+     flows: int

-     def __init__(self, data_rate, packet_rate, end_time, flow_rate):
+     def __init__(self, data_rate, packet_rate, end_time, flow_rate, flows):
        self.data_rate = -data_rate
        self.packet_rate = -packet_rate
        self.time = end_time
        self.flow_rate = -flow_rate
+         self.flows = -flows


class OnePacketFlow(Event):
    bytes: np.uint64
    packets: np.uint64
    time = 0
+     flows: np.uint64

-     def __init__(self, bytes, packets, time):
+     def __init__(self, bytes, packets, time, flows):
        self.bytes = bytes
        self.packets = packets
        self.time = time
+         self.flows = flows


class ExportEvent(Event):
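A side note on the sign convention preserved here: `FlowEndEvent` negates every quantity in its constructor, so replaying start and end events in time order with a running sum yields the load active at each instant, and the new `flows` field follows the same convention. A minimal sketch of that replay, assuming the event classes from this diff are in scope (the replay loop itself is illustrative, not part of the change):

```python
import numpy as np

# One flow that starts at t=1000 and ends at t=2000 (values invented).
start = FlowStartEvent(data_rate=8_000.0, packet_rate=10.0,
                       start_time=np.uint64(1_000), flow_rate=1.0, flows=1)
end = FlowEndEvent(data_rate=8_000.0, packet_rate=10.0,
                   end_time=np.uint64(2_000), flow_rate=1.0, flows=1)

active_flows = 0
for event in sorted([start, end], key=lambda e: e.time):
    active_flows += event.flows  # +1 at the start, -1 at the end
print(active_flows)  # 0 once the flow has ended
```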
@@ -188,12 +202,26 @@ def create_event_queue(

    stats_df.to_csv(hosts_stats_file, sep=";", index=False)

+     agg_dict = {
+         "PACKETS": ("PACKETS", "sum"),
+         "BYTES": ("BYTES", "sum"),
+         "FLOWS": ("PACKETS", "count"),
+     }
    # One-packet flows
-     one_packet_df = df[df["PACKETS"] == 1].sort_values("START_TIME")
-     one_packet_df.to_csv(one_packet_path, index=False)
+     (
+         df[df["PACKETS"] == 1]
+         .groupby("START_TIME", as_index=False)
+         .agg(**agg_dict)
+         .sort_values("START_TIME")
+         .to_csv(one_packet_path, index=False)
+     )

    # Multi-packet flows
-     multi_df = df[df["PACKETS"] > 1]
+     multi_df = (
+         df[df["PACKETS"] > 1]
+         .groupby(["START_TIME", "END_TIME"], as_index=False)
+         .agg(**agg_dict)
+     )
    multi_df.sort_values("START_TIME").to_csv(sorted_by_start_path, index=False)
    multi_df.sort_values("END_TIME").to_csv(sorted_by_end_path, index=False)

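The `agg_dict` introduced above uses pandas named aggregation: each key is an output column and each value a `(source_column, function)` pair, so `FLOWS` is simply a row count per group (any source column would do for `"count"`). A standalone sketch of the same pattern on invented data:

```python
import pandas as pd

df = pd.DataFrame({
    "START_TIME": [1, 1, 2],
    "END_TIME": [5, 5, 6],
    "PACKETS": [3, 4, 2],
    "BYTES": [300, 400, 200],
})

agg_dict = {
    "PACKETS": ("PACKETS", "sum"),
    "BYTES": ("BYTES", "sum"),
    "FLOWS": ("PACKETS", "count"),  # counts rows per group
}

out = df.groupby(["START_TIME", "END_TIME"], as_index=False).agg(**agg_dict)
print(out)
#    START_TIME  END_TIME  PACKETS  BYTES  FLOWS
# 0           1         5        7    700      2
# 1           2         6        2    200      1
```

Aggregating before writing means each output row now represents all flows sharing the same timestamps, which is what makes the `FLOWS` column in `CSV_AGGREGATE_TYPES` meaningful to the readers below.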
@@ -245,21 +273,25 @@ def read_host_stats_events(path: os.PathLike):


def read_one_packet_events(path: str) -> Iterator[OnePacketFlow]:
-     for chunk in pd.read_csv(path, dtype=CSV_COLUMN_TYPES, chunksize=100_000):
+     CSV_AGGREGATE_TYPES_NO_END = {
+         k: v for k, v in CSV_AGGREGATE_TYPES.items() if k != "END_TIME"
+     }
+     for chunk in pd.read_csv(path, dtype=CSV_AGGREGATE_TYPES_NO_END, chunksize=100_000):
        for row in chunk.itertuples(index=False):
            yield OnePacketFlow(
                bytes=np.uint64(row.BYTES),
                packets=np.uint64(row.PACKETS),
                time=np.uint64(row.START_TIME),
+                 flows=row.FLOWS,
            )


def read_start_events(path: str) -> Iterator[FlowStartEvent]:
-     for chunk in pd.read_csv(path, dtype=CSV_COLUMN_TYPES, chunksize=100_000):
+     for chunk in pd.read_csv(path, dtype=CSV_AGGREGATE_TYPES, chunksize=100_000):
        durations = (chunk.END_TIME - chunk.START_TIME + 1) / 1_000
        data_rates = (chunk.BYTES * 8) / durations
        packet_rates = chunk.PACKETS / durations
-         flow_rates = 1 / durations
+         flow_rates = chunk.FLOWS / durations
        for row, dr, pr, fr in zip(
            chunk.itertuples(index=False), data_rates, packet_rates, flow_rates
        ):
@@ -268,15 +300,16 @@ def read_start_events(path: str) -> Iterator[FlowStartEvent]:
                packet_rate=pr,
                start_time=np.uint64(row.START_TIME),
                flow_rate=fr,
+                 flows=row.FLOWS,
            )


def read_end_events(path: str) -> Iterator[FlowEndEvent]:
-     for chunk in pd.read_csv(path, dtype=CSV_COLUMN_TYPES, chunksize=100_000):
+     for chunk in pd.read_csv(path, dtype=CSV_AGGREGATE_TYPES, chunksize=100_000):
        durations = (chunk.END_TIME - chunk.START_TIME + 1) / 1_000
        data_rates = (chunk.BYTES * 8) / durations
        packet_rates = chunk.PACKETS / durations
-         flow_rates = 1 / durations
+         flow_rates = chunk.FLOWS / durations
        for row, dr, pr, fr in zip(
            chunk.itertuples(index=False), data_rates, packet_rates, flow_rates
        ):
@@ -285,4 +318,5 @@ def read_end_events(path: str) -> Iterator[FlowEndEvent]:
                packet_rate=pr,
                end_time=np.uint64(row.END_TIME),
                flow_rate=fr,
+                 flows=row.FLOWS,
            )
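For reference, the rate computation these readers share: assuming the timestamps are milliseconds (as the division by 1_000 suggests), the duration is `(END_TIME - START_TIME + 1) / 1_000` seconds, and with the aggregated `FLOWS` column the flow rate becomes flows per second instead of the previous constant `1 / duration`. A worked sketch on one invented aggregated row:

```python
start_time, end_time = 1_000, 3_999          # ms; 3_000 ms inclusive span
bytes_, packets, flows = 125_000, 400, 3

duration = (end_time - start_time + 1) / 1_000  # 3.0 s
data_rate = bytes_ * 8 / duration    # ~333_333 bits per second
packet_rate = packets / duration     # ~133 packets per second
flow_rate = flows / duration         # 1.0 flows per second (was 1 / duration)
```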