
Commit 16c7f1a

GH-14932: [Python] Add python bindings for JSON streaming reader (#45084)
### Rationale for this change

The C++ Arrow library has a JSON streaming reader which is not exposed in the Python interface.

### What changes are included in this PR?

This PR is based on #33761. It adds the `open_json` method, which opens a streaming reader for a JSON file.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes. A new `open_json` method has been added to the Python interface, located at `pyarrow.json.open_json`, and its parameters are the same as those of `pyarrow.json.read_json`.

* GitHub Issue: #14932

Lead-authored-by: pxc <[email protected]>
Co-authored-by: Akshay Subramanian <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent deccce1 commit 16c7f1a
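Since the commit message notes that `open_json` takes the same parameters as `read_json`, here is a minimal usage sketch (the NDJSON payload below is illustrative, not part of the commit):

```python
import io
from pyarrow import json

# Illustrative newline-delimited JSON payload (not from the commit).
data = b'{"x": 1, "y": "a"}\n{"x": 2, "y": "b"}\n'

# Unlike read_json, which materializes a whole Table, open_json returns
# a streaming reader that yields record batches one at a time.
reader = json.open_json(io.BytesIO(data))
for batch in reader:
    print(batch.num_rows, batch.schema)
```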

8 files changed: +396 -38 lines

docs/source/python/api/formats.rst
Lines changed: 1 addition & 0 deletions

```diff
@@ -66,6 +66,7 @@ JSON Files
 
    ReadOptions
    ParseOptions
+   open_json
    read_json
 
 .. _api.parquet:
```

docs/source/python/json.rst
Lines changed: 12 additions & 0 deletions

```diff
@@ -115,3 +115,15 @@ and pass it to :func:`read_json`. For example, you can pass an explicit
 
 Similarly, you can choose performance settings by passing a
 :class:`ReadOptions` instance to :func:`read_json`.
+
+
+Incremental reading
+-------------------
+
+For memory-constrained environments, it is also possible to read a JSON file
+one batch at a time, using :func:`open_json`.
+
+In this case, type inference is done on the first block and types are frozen afterwards.
+To make sure the right data types are inferred, either set
+:attr:`ReadOptions.block_size` to a large enough value, or use
+:attr:`ParseOptions.explicit_schema` to set the desired data types explicitly.
```
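To illustrate the guidance in this new doc section, here is a hedged sketch of pinning types with `ParseOptions.explicit_schema` (the payload and schema are made up for the example):

```python
import io
import pyarrow as pa
from pyarrow import json

# Made-up payload: if the first block saw only the first row, "y" would
# be inferred as null and that type would then be frozen for later batches.
data = b'{"x": 1, "y": null}\n{"x": 2, "y": "b"}\n'

# Pinning the schema up front sidesteps first-block inference entirely;
# alternatively, ReadOptions(block_size=...) can be made large enough.
schema = pa.schema([("x", pa.int64()), ("y", pa.string())])
parse_options = json.ParseOptions(explicit_schema=schema)

reader = json.open_json(io.BytesIO(data), parse_options=parse_options)
print(reader.schema)  # matches the explicit schema
```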

python/pyarrow/_csv.pyx
Lines changed: 1 addition & 1 deletion

```diff
@@ -1295,7 +1295,7 @@ def open_csv(input_file, read_options=None, parse_options=None,
         Options for converting CSV data
         (see pyarrow.csv.ConvertOptions constructor for defaults)
     memory_pool : MemoryPool, optional
-        Pool to allocate Table memory from
+        Pool to allocate RecordBatch memory from
 
     Returns
     -------
```

python/pyarrow/_json.pyx
Lines changed: 77 additions & 1 deletion

```diff
@@ -21,7 +21,9 @@
 
 from pyarrow.includes.common cimport *
 from pyarrow.includes.libarrow cimport *
-from pyarrow.lib cimport (_Weakrefable, MemoryPool,
+
+from pyarrow.lib cimport (_Weakrefable, Schema,
+                          RecordBatchReader, MemoryPool,
                           maybe_unbox_memory_pool,
                           get_input_stream, pyarrow_wrap_table,
                           pyarrow_wrap_schema, pyarrow_unwrap_schema)
@@ -266,6 +268,38 @@ cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
     out[0] = parse_options.options
 
 
+cdef class JSONStreamingReader(RecordBatchReader):
+    """An object that reads record batches incrementally from a JSON file.
+
+    Should not be instantiated directly by user code.
+    """
+    cdef readonly:
+        Schema schema
+
+    def __init__(self):
+        raise TypeError(f"Do not call {self.__class__.__name__}'s "
+                        "constructor directly, "
+                        "use pyarrow.json.open_json() instead.")
+
+    cdef _open(self, shared_ptr[CInputStream] stream,
+               CJSONReadOptions c_read_options,
+               CJSONParseOptions c_parse_options,
+               MemoryPool memory_pool):
+        cdef:
+            shared_ptr[CSchema] c_schema
+            CIOContext io_context
+
+        io_context = CIOContext(maybe_unbox_memory_pool(memory_pool))
+
+        with nogil:
+            self.reader = <shared_ptr[CRecordBatchReader]> GetResultValue(
+                CJSONStreamingReader.Make(stream, move(c_read_options),
+                                          move(c_parse_options), io_context))
+            c_schema = self.reader.get().schema()
+
+        self.schema = pyarrow_wrap_schema(c_schema)
+
+
 def read_json(input_file, read_options=None, parse_options=None,
               MemoryPool memory_pool=None):
     """
@@ -308,3 +342,45 @@ def read_json(input_file, read_options=None, parse_options=None,
     table = GetResultValue(reader.get().Read())
 
     return pyarrow_wrap_table(table)
+
+
+def open_json(input_file, read_options=None, parse_options=None,
+              MemoryPool memory_pool=None):
+    """
+    Open a streaming reader of JSON data.
+
+    Reading using this function is always single-threaded.
+
+    Parameters
+    ----------
+    input_file : string, path or file-like object
+        The location of JSON data. If a string or path, and if it ends
+        with a recognized compressed file extension (e.g. ".gz" or ".bz2"),
+        the data is automatically decompressed when reading.
+    read_options : pyarrow.json.ReadOptions, optional
+        Options for the JSON reader (see pyarrow.json.ReadOptions constructor
+        for defaults)
+    parse_options : pyarrow.json.ParseOptions, optional
+        Options for the JSON parser
+        (see pyarrow.json.ParseOptions constructor for defaults)
+    memory_pool : MemoryPool, optional
+        Pool to allocate RecordBatch memory from
+
+    Returns
+    -------
+    :class:`pyarrow.json.JSONStreamingReader`
+    """
+    cdef:
+        shared_ptr[CInputStream] stream
+        CJSONReadOptions c_read_options
+        CJSONParseOptions c_parse_options
+        JSONStreamingReader reader
+
+    _get_reader(input_file, &stream)
+    _get_read_options(read_options, &c_read_options)
+    _get_parse_options(parse_options, &c_parse_options)
+
+    reader = JSONStreamingReader.__new__(JSONStreamingReader)
+    reader._open(stream, move(c_read_options), move(c_parse_options),
+                 memory_pool)
+    return reader
```
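A quick behavioral sketch of the class above (assumptions: a tiny in-memory payload; `JSONStreamingReader` is reached via the returned object, since the commit does not re-export it from `pyarrow.json`):

```python
import io
from pyarrow import json

reader = json.open_json(io.BytesIO(b'{"x": 1}\n'))
print(type(reader).__name__)  # JSONStreamingReader
print(reader.schema)          # schema inferred from the first block, then frozen

# __init__ raises TypeError, so instances can only come from open_json().
try:
    type(reader)()
except TypeError as exc:
    print(exc)
```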

python/pyarrow/includes/libarrow.pxd
Lines changed: 7 additions & 0 deletions

```diff
@@ -2176,6 +2176,13 @@ cdef extern from "arrow/json/reader.h" namespace "arrow::json" nogil:
 
         CResult[shared_ptr[CTable]] Read()
 
+    cdef cppclass CJSONStreamingReader" arrow::json::StreamingReader"(
+            CRecordBatchReader):
+        @staticmethod
+        CResult[shared_ptr[CJSONStreamingReader]] Make(
+            shared_ptr[CInputStream],
+            CJSONReadOptions, CJSONParseOptions, CIOContext)
+
 
 cdef extern from "arrow/util/thread_pool.h" namespace "arrow::internal" nogil:
```

python/pyarrow/json.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -16,4 +16,4 @@
 # under the License.
 
 
-from pyarrow._json import ReadOptions, ParseOptions, read_json  # noqa
+from pyarrow._json import ReadOptions, ParseOptions, read_json, open_json  # noqa
```

python/pyarrow/tests/test_csv.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -387,7 +387,7 @@ def read_bytes(self, b, **kwargs):
         """
         :param b: bytes to be parsed
         :param kwargs: arguments passed on to open the csv file
-        :return: b parsed as a single RecordBatch
+        :return: b parsed as a single Table
         """
         raise NotImplementedError
```
