-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-44066: [Python] Add Python wrapper for JsonExtensionType #44070
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
20e1b36
399a87a
e0c92d5
943553a
4ab988c
64d4975
b74fca6
33b9241
466b597
16958c4
d26b327
dc5143f
f4d753c
86208e3
7afd8be
520f0aa
9646944
583ba67
20fe633
e8baa24
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1926,3 +1926,56 @@ def test_bool8_scalar(): | |
| assert pa.scalar(1, type=pa.bool8()).as_py() is True | ||
| assert pa.scalar(2, type=pa.bool8()).as_py() is True | ||
| assert pa.scalar(None, type=pa.bool8()).as_py() is None | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("storage_type", ( | ||
| pa.string(), pa.large_string(), pa.string_view())) | ||
| def test_json(storage_type, pickle_module): | ||
| data = ['{"a": 1}', '{"b": 2}', None] | ||
| json_type = pa.json_(storage_type) | ||
| storage = pa.array(data, type=storage_type) | ||
| array = pa.array(data, type=json_type) | ||
| json_arr_class = json_type.__arrow_ext_class__() | ||
|
|
||
| assert pa.json_() == pa.json_(pa.utf8()) | ||
|
||
| assert json_type.extension_name == "arrow.json" | ||
| assert json_type.storage_type == storage_type | ||
| assert json_type.__class__ is pa.JsonType | ||
jorisvandenbossche marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| assert json_type == pa.json_(storage_type) | ||
| assert json_type != storage_type | ||
|
|
||
| assert isinstance(array, pa.JsonArray) | ||
|
|
||
| assert array.to_pylist() == data | ||
| assert array[0].as_py() == data[0] | ||
| assert array[2].as_py() is None | ||
|
|
||
| # Pickle roundtrip | ||
| result = pickle_module.loads(pickle_module.dumps(json_type)) | ||
| assert result == json_type | ||
|
|
||
| # IPC roundtrip | ||
| buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"])) | ||
| batch = ipc_read_batch(buf) | ||
| reconstructed_array = batch.column(0) | ||
| assert reconstructed_array.type == json_type | ||
| assert reconstructed_array == array | ||
| assert isinstance(array, json_arr_class) | ||
|
|
||
| assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar | ||
| assert isinstance(array[0], pa.JsonScalar) | ||
|
|
||
| # cast storage -> extension type | ||
| result = storage.cast(json_type) | ||
| assert result == array | ||
|
|
||
| # cast extension type -> storage type | ||
| inner = array.cast(storage_type) | ||
| assert inner == storage | ||
rok marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| for storage_type in (pa.int32(), pa.large_binary(), pa.float32()): | ||
| with pytest.raises( | ||
| pa.ArrowInvalid, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pity this doesn't raise
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could catch and raise it but it's probably not a good idea.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed. |
||
| match=f"Invalid storage type for JsonExtensionType: {storage_type}"): | ||
| pa.json_(storage_type) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType): | |
| return ExtensionScalar | ||
|
|
||
|
|
||
| cdef class JsonType(BaseExtensionType): | ||
| """ | ||
| Concrete class for JSON extension type. | ||
|
|
||
| Examples | ||
| -------- | ||
| Define the extension type for JSON array | ||
|
|
||
| >>> import pyarrow as pa | ||
| >>> json_type = pa.json_(pa.large_utf8()) | ||
|
|
||
| Create an extension array | ||
|
|
||
| >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] | ||
| >>> storage = pa.array(arr, pa.large_utf8()) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Side note: it would be nice if one could write
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would be a nice feature. #44406 |
||
| >>> pa.ExtensionArray.from_storage(json_type, storage) | ||
| <pyarrow.lib.JsonArray object at ...> | ||
| [ | ||
| null, | ||
| "{ "id":30, "values":["a", "b"] }" | ||
| ] | ||
| """ | ||
|
|
||
| cdef void init(self, const shared_ptr[CDataType]& type) except *: | ||
| BaseExtensionType.init(self, type) | ||
| self.json_ext_type = <const CJsonType*> type.get() | ||
|
|
||
| def __arrow_ext_class__(self): | ||
| return JsonArray | ||
|
|
||
| def __reduce__(self): | ||
| return json_, (self.storage_type,) | ||
|
|
||
| def __arrow_ext_scalar_class__(self): | ||
| return JsonScalar | ||
|
|
||
|
|
||
| cdef class UuidType(BaseExtensionType): | ||
| """ | ||
| Concrete class for UUID extension type. | ||
|
|
@@ -5296,6 +5333,44 @@ def run_end_encoded(run_end_type, value_type): | |
| return pyarrow_wrap_data_type(ree_type) | ||
|
|
||
|
|
||
| def json_(DataType storage_type=utf8()): | ||
| """ | ||
| Create an instance of the JSON extension type. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| storage_type : DataType, default pyarrow.string() | ||
| The underlying data type. Can be one of the following types: | ||
| string, large_string, string_view. | ||
|
|
||
rok marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Returns | ||
| ------- | ||
| type : JsonType | ||
|
|
||
| Examples | ||
| -------- | ||
| Create an instance of JSON extension type: | ||
|
|
||
| >>> import pyarrow as pa | ||
| >>> pa.json_(pa.utf8()) | ||
| JsonType(extension<arrow.json>) | ||
|
|
||
| Use the JSON type to create an array: | ||
|
|
||
| >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8())) | ||
| <pyarrow.lib.JsonArray object at ...> | ||
| [ | ||
| "{"a": 1}", | ||
| "{"b": 2}" | ||
| ] | ||
| """ | ||
|
|
||
| cdef JsonType out = JsonType.__new__(JsonType) | ||
| c_json_ext_type = GetResultValue(CJsonType.Make(storage_type.sp_type)) | ||
| out.init(c_json_ext_type) | ||
| return out | ||
|
|
||
|
|
||
| def uuid(): | ||
| """ | ||
| Create UuidType instance. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.