@@ -198,14 +198,14 @@ def run_query(
         raise ValueError(f'Unrecognized method: {method}')
 
 
-def get_args(signed: List, json_output_path: str, columns: List) -> Iterable:
+def get_args(signed: List, json_output_folder: str, columns: List) -> Iterable:
     for i, r in enumerate(signed):
-        yield (i, r.url, json_output_path, columns)
+        yield (i, r.url, json_output_folder, columns)
 
 
 def download(ipart: int,
              url: str,
-             json_output_path: str,
+             json_output_folder: str,
              columns: Optional[List] = None,
              resp_format: str = 'arrow',
              compressed: bool = False) -> None:
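
Not part of the patch: a minimal sketch of how get_args() and download() above fit together. The process-pool wiring is an assumption; only the download_starargs name is taken from a hunk header further down, and signed, json_output_folder, and columns are presumed to be defined by the caller.

# Sketch only -- not from this change. download_starargs(args) is assumed to unpack one
# (ipart, url, json_output_folder, columns) tuple yielded by get_args and call download.
from multiprocessing import Pool

def run_downloads(signed, json_output_folder, columns, processes=4):
    with Pool(processes=processes) as pool:
        pool.map(download_starargs, get_args(signed, json_output_folder, columns))
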
@@ -214,7 +214,7 @@ def download(ipart: int,
     Args:
         ipart (int): presigned url id
         url (str): presigned url
-        json_output_path (str): directory to save the ipart_th segment of dataframe
+        json_output_folder (str): directory to save the ipart_th segment of dataframe
         columns (list): schema to save to json
         resp_format (str): whether to use arrow or json when collect
         compressed (bool): if data is compressed before downloading. Need decompress if compressed=True.
@@ -224,7 +224,7 @@ def download(ipart: int,
     if resp_format == 'json':
         data = resp.json()
         pd.DataFrame(data, columns=columns).to_json(os.path.join(
-            json_output_path, 'part_' + str(ipart) + '.jsonl'),
+            json_output_folder, 'part_' + str(ipart) + '.jsonl'),
                                                     orient='records',
                                                     lines=True)
         return
@@ -242,7 +242,7 @@ def download(ipart: int,
 
     # Convert the PyArrow table into a pandas DataFrame
     df = table.to_pandas()
-    df.to_json(os.path.join(json_output_path,
+    df.to_json(os.path.join(json_output_folder,
                             'part_' + str(ipart) + '.jsonl'),
               orient='records',
               lines=True,
@@ -256,7 +256,7 @@ def download_starargs(args: Tuple) -> None:
 def fetch_data(method: str, cursor: Optional[Cursor],
                sparkSession: Optional[SparkSession], start: int, end: int,
                order_by: str, tablename: str, columns_str: str,
-               json_output_path: str) -> None:
+               json_output_folder: str) -> None:
     """Fetches a specified range of rows from a given table to a json file.
 
     This function executes a SQL query to retrieve a range of rows, determined by 'start' and 'end' indexes,
@@ -271,7 +271,7 @@ def fetch_data(method: str, cursor: Optional[Cursor],
         order_by (str): The column name to use for ordering the rows.
         tablename (str): The name of the table from which to fetch the data.
         columns_str (str): The string representation of the columns to select from the table.
-        json_output_path (str): The file path where the resulting JSON file will be saved.
+        json_output_folder (str): The file path where the resulting JSON file will be saved.
 
     Returns:
         None: The function doesn't return any value, but writes the result to a JSONL file.
@@ -301,15 +301,15 @@ def fetch_data(method: str, cursor: Optional[Cursor],
     records = [r.asDict() for r in ans]  # pyright: ignore
     pdf = pd.DataFrame.from_dict(records)
 
-    pdf.to_json(os.path.join(json_output_path, f'part_{start+1}_{end}.jsonl'),
+    pdf.to_json(os.path.join(json_output_folder, f'part_{start+1}_{end}.jsonl'),
                 orient='records',
                 lines=True)
 
 
 def fetch(
     method: str,
     tablename: str,
-    json_output_path: str,
+    json_output_folder: str,
     batch_size: int = 1 << 30,
     processes: int = 1,
     sparkSession: Optional[SparkSession] = None,
@@ -320,7 +320,7 @@ def fetch(
     Args:
         method (str): dbconnect or dbsql
         tablename (str): catalog.scheme.tablename on UC
-        json_output_path (str): path to write the result json file to
+        json_output_folder (str): path to write the result json file to
         batch_size (int): number of rows that dbsql fetches each time to avoid OOM
         processes (int): max number of processes to use to parallelize the fetch
         sparkSession (pyspark.sql.sparksession): spark session
@@ -358,7 +358,7 @@ def fetch(
         signed, _, _ = df.collect_cf('arrow')  # pyright: ignore
         log.info(f'len(signed) = {len(signed)}')
 
-        args = get_args(signed, json_output_path, columns)
+        args = get_args(signed, json_output_folder, columns)
 
         # Stopping the SparkSession to avoid spilling connection state into the subprocesses.
         sparkSession.stop()
@@ -371,7 +371,7 @@ def fetch(
             log.warning(f'batch {start}')
             end = min(start + batch_size, nrows)
             fetch_data(method, cursor, sparkSession, start, end, order_by,
-                       tablename, columns_str, json_output_path)
+                       tablename, columns_str, json_output_folder)
 
     if cursor is not None:
         cursor.close()
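
A quick worked example of the batching above, with made-up numbers. The enclosing for-loop header is not shown in this hunk, so the range() form is inferred from the start/end arithmetic:

# Illustrative values only: how start/end and the part file names from fetch_data line up.
nrows, batch_size = 2500, 1000
for start in range(0, nrows, batch_size):
    end = min(start + batch_size, nrows)
    print(f'part_{start+1}_{end}.jsonl')
# prints: part_1_1000.jsonl, part_1001_2000.jsonl, part_2001_2500.jsonl
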
@@ -381,21 +381,24 @@ def fetch_DT(args: Namespace) -> None:
     """Fetch UC Delta Table to local as jsonl."""
     log.info(f'Start .... Convert delta to json')
 
-    obj = urllib.parse.urlparse(args.json_output_path)
+    obj = urllib.parse.urlparse(args.json_output_folder)
     if obj.scheme != '':
         raise ValueError(
-            f'Check the json_output_path and verify it is a local path!')
+            f'Check the json_output_folder and verify it is a local path!')
 
-    if os.path.exists(args.json_output_path):
-        if not os.path.isdir(args.json_output_path) or os.listdir(
-                args.json_output_path):
+    if os.path.exists(args.json_output_folder):
+        if not os.path.isdir(args.json_output_folder) or os.listdir(
+                args.json_output_folder):
             raise RuntimeError(
-                f'A file or a folder {args.json_output_path} already exists and is not empty. Remove it and retry!'
+                f'A file or a folder {args.json_output_folder} already exists and is not empty. Remove it and retry!'
             )
 
-    os.makedirs(args.json_output_path, exist_ok=True)
+    os.makedirs(args.json_output_folder, exist_ok=True)
 
-    log.info(f'Directory {args.json_output_path} created.')
+    if not args.json_output_filename.endswith('.jsonl'):
+        raise ValueError('json_output_filename needs to be a jsonl file')
+
+    log.info(f'Directory {args.json_output_folder} created.')
 
     method = 'dbsql'
     dbsql = None
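
For reference (not part of the patch), the checks above behave as shown below; the example paths are placeholders:

import urllib.parse

urllib.parse.urlparse('/tmp/delta_json_out').scheme   # '' -> accepted as a local folder
urllib.parse.urlparse('s3://bucket/out').scheme       # 's3' -> triggers the ValueError above
'train-00000-of-00001.jsonl'.endswith('.jsonl')       # True -> passes the new filename check
'combined.json'.endswith('.jsonl')                    # False -> raises the new ValueError
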
@@ -451,16 +454,16 @@ def fetch_DT(args: Namespace) -> None:
                 'Failed to create sql connection to db workspace. To use sql connect, you need to provide http_path and cluster_id!'
             ) from e
 
-    fetch(method, args.delta_table_name, args.json_output_path, args.batch_size,
-          args.processes, sparkSession, dbsql)
+    fetch(method, args.delta_table_name, args.json_output_folder,
+          args.batch_size, args.processes, sparkSession, dbsql)
 
     if dbsql is not None:
         dbsql.close()
 
     # combine downloaded jsonl into one big jsonl for IFT
     iterative_combine_jsons(
-        args.json_output_path,
-        os.path.join(args.json_output_path, 'combined.jsonl'))
+        args.json_output_folder,
+        os.path.join(args.json_output_folder, args.json_output_filename))
 
 
 if __name__ == '__main__':
@@ -471,7 +474,7 @@ def fetch_DT(args: Namespace) -> None:
                         required=True,
                         type=str,
                         help='UC table <catalog>.<schema>.<table name>')
-    parser.add_argument('--json_output_path',
+    parser.add_argument('--json_output_folder',
                         required=True,
                         type=str,
                         help='Local path to save the converted json')
@@ -505,6 +508,12 @@ def fetch_DT(args: Namespace) -> None:
         help=
         'Use serverless or not. Make sure the workspace is entitled with serverless'
     )
+    parser.add_argument(
+        '--json_output_filename',
+        required=False,
+        type=str,
+        default='train-00000-of-00001.jsonl',
+        help='The combined final jsonl that combines all partitioned jsonl')
     args = parser.parse_args()
 
     from databricks.sdk import WorkspaceClient
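
A possible invocation after this change; the script name, catalog/schema/table, and output folder are placeholders, and only flags that appear in this diff are used. Depending on the workspace setup, the http_path/cluster_id connection flags mentioned in the error message above may also be needed:

python convert_delta_to_json.py \
    --delta_table_name <catalog>.<schema>.<table name> \
    --json_output_folder /tmp/delta_json_out \
    --json_output_filename train-00000-of-00001.jsonl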