    "MSG_LENGTH": np.uint64,
}

+ CSV_AGGREGATE_TYPES = {
+     "START_TIME": np.uint64,
+     "END_TIME": np.uint64,
+     "PACKETS": np.uint64,
+     "BYTES": np.uint64,
+     "FLOWS": np.uint64,
+ }
+
STATS_CSV_COLUMN_TYPES = {
    "Time": np.uint64,
    "UID": np.uint64,
@@ -94,36 +102,42 @@ class FlowStartEvent(Event):
    packet_rate: float
    time = 0
    flow_rate: float
+     flows: int

-     def __init__(self, data_rate, packet_rate, start_time, flow_rate):
+     def __init__(self, data_rate, packet_rate, start_time, flow_rate, flows):
        self.data_rate = data_rate
        self.packet_rate = packet_rate
        self.time = start_time
        self.flow_rate = flow_rate
+         self.flows = flows


class FlowEndEvent(Event):
    data_rate: float
    packet_rate: float
    time = 0
    flow_rate: float
+     flows: int

-     def __init__(self, data_rate, packet_rate, end_time, flow_rate):
+     def __init__(self, data_rate, packet_rate, end_time, flow_rate, flows):
        self.data_rate = -data_rate
        self.packet_rate = -packet_rate
        self.time = end_time
        self.flow_rate = -flow_rate
+         self.flows = -flows


class OnePacketFlow(Event):
    bytes: np.uint64
    packets: np.uint64
    time = 0
+     flows: np.uint64

-     def __init__(self, bytes, packets, time):
+     def __init__(self, bytes, packets, time, flows):
        self.bytes = bytes
        self.packets = packets
        self.time = time
+         self.flows = flows


class ExportEvent(Event):
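A side note on the sign convention preserved here: `FlowEndEvent` negates every quantity in its constructor, so replaying start and end events in time order with a running sum yields the load active at each instant, and the new `flows` field follows the same convention. A minimal sketch of that replay, assuming the event classes from this diff are in scope (the replay loop itself is illustrative, not part of the change):

```python
import numpy as np

# One flow that starts at t=1000 and ends at t=2000 (values invented).
start = FlowStartEvent(data_rate=8_000.0, packet_rate=10.0,
                       start_time=np.uint64(1_000), flow_rate=1.0, flows=1)
end = FlowEndEvent(data_rate=8_000.0, packet_rate=10.0,
                   end_time=np.uint64(2_000), flow_rate=1.0, flows=1)

active_flows = 0
for event in sorted([start, end], key=lambda e: e.time):
    active_flows += event.flows  # +1 at the start, -1 at the end
print(active_flows)  # 0 once the flow has ended
```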
@@ -188,12 +202,26 @@ def create_event_queue(

    stats_df.to_csv(hosts_stats_file, sep=";", index=False)

+     agg_dict = {
+         "PACKETS": ("PACKETS", "sum"),
+         "BYTES": ("BYTES", "sum"),
+         "FLOWS": ("PACKETS", "count"),
+     }
    # One-packet flows
-     one_packet_df = df[df["PACKETS"] == 1].sort_values("START_TIME")
-     one_packet_df.to_csv(one_packet_path, index=False)
+     (
+         df[df["PACKETS"] == 1]
+         .groupby("START_TIME", as_index=False)
+         .agg(**agg_dict)
+         .sort_values("START_TIME")
+         .to_csv(one_packet_path, index=False)
+     )

    # Multi-packet flows
-     multi_df = df[df["PACKETS"] > 1]
+     multi_df = (
+         df[df["PACKETS"] > 1]
+         .groupby(["START_TIME", "END_TIME"], as_index=False)
+         .agg(**agg_dict)
+     )
    multi_df.sort_values("START_TIME").to_csv(sorted_by_start_path, index=False)
    multi_df.sort_values("END_TIME").to_csv(sorted_by_end_path, index=False)

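The `agg_dict` introduced above uses pandas named aggregation: each key is an output column and each value a `(source_column, function)` pair, so `FLOWS` is simply a row count per group (any source column would do for `"count"`). A standalone sketch of the same pattern on invented data:

```python
import pandas as pd

df = pd.DataFrame({
    "START_TIME": [1, 1, 2],
    "END_TIME": [5, 5, 6],
    "PACKETS": [3, 4, 2],
    "BYTES": [300, 400, 200],
})

agg_dict = {
    "PACKETS": ("PACKETS", "sum"),
    "BYTES": ("BYTES", "sum"),
    "FLOWS": ("PACKETS", "count"),  # counts rows per group
}

out = df.groupby(["START_TIME", "END_TIME"], as_index=False).agg(**agg_dict)
print(out)
#    START_TIME  END_TIME  PACKETS  BYTES  FLOWS
# 0           1         5        7    700      2
# 1           2         6        2    200      1
```

Aggregating before writing means each output row now represents all flows sharing the same timestamps, which is what makes the `FLOWS` column in `CSV_AGGREGATE_TYPES` meaningful to the readers below.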
@@ -245,21 +273,25 @@ def read_host_stats_events(path: os.PathLike):


def read_one_packet_events(path: str) -> Iterator[OnePacketFlow]:
-     for chunk in pd.read_csv(path, dtype=CSV_COLUMN_TYPES, chunksize=100_000):
+     CSV_AGGREGATE_TYPES_NO_END = {
+         k: v for k, v in CSV_AGGREGATE_TYPES.items() if k != "END_TIME"
+     }
+     for chunk in pd.read_csv(path, dtype=CSV_AGGREGATE_TYPES_NO_END, chunksize=100_000):
        for row in chunk.itertuples(index=False):
            yield OnePacketFlow(
                bytes=np.uint64(row.BYTES),
                packets=np.uint64(row.PACKETS),
                time=np.uint64(row.START_TIME),
+                 flows=row.FLOWS,
            )


def read_start_events(path: str) -> Iterator[FlowStartEvent]:
-     for chunk in pd.read_csv(path, dtype=CSV_COLUMN_TYPES, chunksize=100_000):
+     for chunk in pd.read_csv(path, dtype=CSV_AGGREGATE_TYPES, chunksize=100_000):
        durations = (chunk.END_TIME - chunk.START_TIME + 1) / 1_000
        data_rates = (chunk.BYTES * 8) / durations
        packet_rates = chunk.PACKETS / durations
-         flow_rates = 1 / durations
+         flow_rates = chunk.FLOWS / durations
        for row, dr, pr, fr in zip(
            chunk.itertuples(index=False), data_rates, packet_rates, flow_rates
        ):
@@ -268,15 +300,16 @@ def read_start_events(path: str) -> Iterator[FlowStartEvent]:
                packet_rate=pr,
                start_time=np.uint64(row.START_TIME),
                flow_rate=fr,
+                 flows=row.FLOWS,
            )


def read_end_events(path: str) -> Iterator[FlowEndEvent]:
-     for chunk in pd.read_csv(path, dtype=CSV_COLUMN_TYPES, chunksize=100_000):
+     for chunk in pd.read_csv(path, dtype=CSV_AGGREGATE_TYPES, chunksize=100_000):
        durations = (chunk.END_TIME - chunk.START_TIME + 1) / 1_000
        data_rates = (chunk.BYTES * 8) / durations
        packet_rates = chunk.PACKETS / durations
-         flow_rates = 1 / durations
+         flow_rates = chunk.FLOWS / durations
        for row, dr, pr, fr in zip(
            chunk.itertuples(index=False), data_rates, packet_rates, flow_rates
        ):
@@ -285,4 +318,5 @@ def read_end_events(path: str) -> Iterator[FlowEndEvent]:
                packet_rate=pr,
                end_time=np.uint64(row.END_TIME),
                flow_rate=fr,
+                 flows=row.FLOWS,
            )
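For reference, the rate computation these readers share: assuming the timestamps are milliseconds (as the division by 1_000 suggests), the duration is `(END_TIME - START_TIME + 1) / 1_000` seconds, and with the aggregated `FLOWS` column the flow rate becomes flows per second instead of the previous constant `1 / duration`. A worked sketch on one invented aggregated row:

```python
start_time, end_time = 1_000, 3_999          # ms; 3_000 ms inclusive span
bytes_, packets, flows = 125_000, 400, 3

duration = (end_time - start_time + 1) / 1_000  # 3.0 s
data_rate = bytes_ * 8 / duration    # ~333_333 bits per second
packet_rate = packets / duration     # ~133 packets per second
flow_rate = flows / duration         # 1.0 flows per second (was 1 / duration)
```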