Skip to content

Commit ed91f6f

Browse files
rmnskb and pitrou authored
GH-47441: [Python][Parquet] Allow passing write_time_adjusted_to_utc to Python's ParquetWriter (#47745)
### Rationale for this change

Please see #47441 and #41476. The `ArrowWriterProperties.write_time_adjusted_to_utc` flag is available in C++, yet isn't accessible from Python. This PR exposes that flag in the Python API as well.

### What changes are included in this PR?

Exposure of the `write_time_adjusted_to_utc` boolean argument in Python's API.

### Are these changes tested?

Yes, roundtrip Parquet tests for all combinations of time types and their respective time units.

### Are there any user-facing changes?

Users will be able to set this flag directly from the Python API.

* GitHub Issue: #47441

Lead-authored-by: Bogdan Romenskii <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 0f311e1 commit ed91f6f

File tree

5 files changed

+59
-2
lines changed

5 files changed

+59
-2
lines changed

python/pyarrow/_parquet.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
6666
writer_engine_version=*,
6767
use_compliant_nested_type=*,
6868
store_schema=*,
69+
write_time_adjusted_to_utc=*,
6970
) except *
7071

7172

python/pyarrow/_parquet.pyx

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2202,7 +2202,8 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
22022202
allow_truncated_timestamps=False,
22032203
writer_engine_version=None,
22042204
use_compliant_nested_type=True,
2205-
store_schema=True) except *:
2205+
store_schema=True,
2206+
write_time_adjusted_to_utc=False) except *:
22062207
"""Arrow writer properties"""
22072208
cdef:
22082209
shared_ptr[ArrowWriterProperties] arrow_properties
@@ -2251,6 +2252,8 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
22512252
elif writer_engine_version != "V2":
22522253
raise ValueError(f"Unsupported Writer Engine Version: {writer_engine_version}")
22532254

2255+
arrow_props.set_time_adjusted_to_utc(write_time_adjusted_to_utc)
2256+
22542257
arrow_properties = arrow_props.build()
22552258

22562259
return arrow_properties
@@ -2312,7 +2315,8 @@ cdef class ParquetWriter(_Weakrefable):
23122315
write_page_checksum=False,
23132316
sorting_columns=None,
23142317
store_decimal_as_integer=False,
2315-
use_content_defined_chunking=False):
2318+
use_content_defined_chunking=False,
2319+
write_time_adjusted_to_utc=False):
23162320
cdef:
23172321
shared_ptr[WriterProperties] properties
23182322
shared_ptr[ArrowWriterProperties] arrow_properties
@@ -2356,6 +2360,7 @@ cdef class ParquetWriter(_Weakrefable):
23562360
writer_engine_version=writer_engine_version,
23572361
use_compliant_nested_type=use_compliant_nested_type,
23582362
store_schema=store_schema,
2363+
write_time_adjusted_to_utc=write_time_adjusted_to_utc,
23592364
)
23602365

23612366
pool = maybe_unbox_memory_pool(memory_pool)

python/pyarrow/includes/libparquet.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
519519
Builder* enable_compliant_nested_types()
520520
Builder* disable_compliant_nested_types()
521521
Builder* set_engine_version(ArrowWriterEngineVersion version)
522+
Builder* set_time_adjusted_to_utc(c_bool adjusted)
522523
shared_ptr[ArrowWriterProperties] build()
523524
c_bool support_deprecated_int96_timestamps()
524525

python/pyarrow/parquet/core.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -939,6 +939,12 @@ def _sanitize_table(table, new_schema, flavor):
939939
balance between deduplication ratio and fragmentation. Use norm_level=1 or
940940
norm_level=2 to reach a higher deduplication ratio at the expense of
941941
fragmentation.
942+
write_time_adjusted_to_utc : bool, default False
943+
Set the value of isAdjustedToUTC when writing a TIME column.
944+
If True, this tells the Parquet reader that the TIME columns
945+
are expressed in reference to midnight in the UTC timezone.
946+
If False (the default), the TIME columns are assumed to be expressed
947+
in reference to midnight in an unknown, presumably local, timezone.
942948
"""
943949

944950
_parquet_writer_example_doc = """\
@@ -1035,6 +1041,7 @@ def __init__(self, where, schema, filesystem=None,
10351041
write_page_checksum=False,
10361042
sorting_columns=None,
10371043
store_decimal_as_integer=False,
1044+
write_time_adjusted_to_utc=False,
10381045
**options):
10391046
if use_deprecated_int96_timestamps is None:
10401047
# Use int96 timestamps for Spark
@@ -1088,6 +1095,7 @@ def __init__(self, where, schema, filesystem=None,
10881095
write_page_checksum=write_page_checksum,
10891096
sorting_columns=sorting_columns,
10901097
store_decimal_as_integer=store_decimal_as_integer,
1098+
write_time_adjusted_to_utc=write_time_adjusted_to_utc,
10911099
**options)
10921100
self.is_open = True
10931101

@@ -1949,6 +1957,7 @@ def write_table(table, where, row_group_size=None, version='2.6',
19491957
write_page_checksum=False,
19501958
sorting_columns=None,
19511959
store_decimal_as_integer=False,
1960+
write_time_adjusted_to_utc=False,
19521961
**kwargs):
19531962
# Implementor's note: when adding keywords here / updating defaults, also
19541963
# update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions
@@ -1980,6 +1989,7 @@ def write_table(table, where, row_group_size=None, version='2.6',
19801989
write_page_checksum=write_page_checksum,
19811990
sorting_columns=sorting_columns,
19821991
store_decimal_as_integer=store_decimal_as_integer,
1992+
write_time_adjusted_to_utc=write_time_adjusted_to_utc,
19831993
**kwargs) as writer:
19841994
writer.write_table(table, row_group_size=row_group_size)
19851995
except Exception:

python/pyarrow/tests/parquet/test_parquet_writer.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,3 +447,43 @@ def test_parquet_content_defined_chunking_parameters(tempdir):
447447
# using min_chunk_size, max_chunk_size and norm_level
448448
cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536, "norm_level": 1}
449449
pq.write_table(table, path, use_content_defined_chunking=cdc_options)
450+
451+
452+
@pytest.mark.parametrize("time_type, time_unit", [
453+
(pa.time32, "s"),
454+
(pa.time32, "ms"),
455+
(pa.time64, "us"),
456+
(pa.time64, "ns"),
457+
])
458+
@pytest.mark.parametrize("utc_flag_val", [False, True])
459+
def test_arrow_writer_props_time_adjusted_to_utc(
460+
tempdir,
461+
utc_flag_val,
462+
time_type,
463+
time_unit,
464+
):
465+
# GH-47441
466+
filename = tempdir / "time_adjusted_to_utc.parquet"
467+
468+
time_values = [0, 123, 10_000, 86_399]
469+
470+
table = pa.table({
471+
"time_col": pa.array(time_values, type=time_type(time_unit)),
472+
})
473+
474+
schema = pa.schema([
475+
("time_col", time_type(time_unit)),
476+
])
477+
478+
with pq.ParquetWriter(
479+
where=filename,
480+
schema=schema,
481+
write_time_adjusted_to_utc=utc_flag_val,
482+
) as writer:
483+
writer.write_table(table)
484+
485+
result = pq.read_table(filename, schema=schema)
486+
487+
result.validate(full=True)
488+
489+
assert result.equals(table)

0 commit comments

Comments
 (0)