@@ -268,7 +268,7 @@ def _walk(path: Path):
268
268
yield p
269
269
270
270
271
- def preview_csv (p : Path , file_name : str , simple = True ) -> str :
271
+ def preview_csv (p : Path , file_name : str , simple = True , show_nan_columns = False ) -> str :
272
272
"""Generate a textual preview of a csv file
273
273
274
274
Args:
@@ -287,7 +287,7 @@ def preview_csv(p: Path, file_name: str, simple=True) -> str:
287
287
288
288
if simple :
289
289
cols = df .columns .tolist ()
290
- sel_cols = 15
290
+ sel_cols = min ( len ( cols ), 100 )
291
291
cols_str = ", " .join (cols [:sel_cols ])
292
292
res = f"The columns are: { cols_str } "
293
293
if len (cols ) > sel_cols :
@@ -312,6 +312,10 @@ def preview_csv(p: Path, file_name: str, simple=True) -> str:
312
312
out .append (
313
313
f"{ name } has { df [col ].nunique ()} unique values. Some example values: { df [col ].value_counts ().head (4 ).index .tolist ()} "
314
314
)
315
+ if show_nan_columns :
316
+ nan_cols = [col for col in df .columns .tolist () if df [col ].isnull ().any ()]
317
+ if nan_cols :
318
+ out .append (f"Columns containing NaN values: { ', ' .join (nan_cols )} " )
315
319
316
320
return "\n " .join (out )
317
321
@@ -346,7 +350,7 @@ def preview_json(p: Path, file_name: str):
346
350
return f"-> { file_name } has auto-generated json schema:\n " + builder .to_json (indent = 2 )
347
351
348
352
349
- def describe_data_folder_v2 (base_path , include_file_details = True , simple = False ):
353
+ def describe_data_folder_v2 (base_path , include_file_details = True , simple = False , show_nan_columns = False ):
350
354
"""
351
355
Generate a textual preview of a directory, including an overview of the directory
352
356
structure and previews of individual files
@@ -359,7 +363,7 @@ def describe_data_folder_v2(base_path, include_file_details=True, simple=False):
359
363
file_name = str (fn .relative_to (base_path ))
360
364
361
365
if fn .suffix == ".csv" :
362
- out .append (preview_csv (fn , file_name , simple = simple ))
366
+ out .append (preview_csv (fn , file_name , simple = simple , show_nan_columns = show_nan_columns ))
363
367
elif fn .suffix == ".json" :
364
368
out .append (preview_json (fn , file_name ))
365
369
elif fn .suffix in plaintext_files :
@@ -374,7 +378,9 @@ def describe_data_folder_v2(base_path, include_file_details=True, simple=False):
374
378
375
379
# if the result is very long we generate a simpler version
376
380
if len (result ) > 6_000 and not simple :
377
- return describe_data_folder_v2 (base_path , include_file_details = include_file_details , simple = True )
381
+ return describe_data_folder_v2 (
382
+ base_path , include_file_details = include_file_details , simple = True , show_nan_columns = show_nan_columns
383
+ )
378
384
# if still too long, we truncate
379
385
if len (result ) > 6_000 and simple :
380
386
return result [:6_000 ] + "\n ... (truncated)"
0 commit comments