Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release
# ci/docker/python-wheel-windows-vs2019.dockerfile.
# This is a workaround for our CI problem that "archery docker build" doesn't
# use pulled built images in dev/tasks/python-wheels/github.windows.yml.
PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-02-05
PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-03-12

# Use conanio/${CONAN} for "docker-compose run --rm conan". See
# https://github.com/conan-io/conan-docker-tools#readme for available
Expand Down
3 changes: 3 additions & 0 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,9 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}")

# Python and Numpy libraries
find_package(Python3Alt REQUIRED)
message(STATUS "Found NumPy version: ${Python3_NumPy_VERSION}")
message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}")

include(UseCython)

# PyArrow C++
Expand Down
10 changes: 6 additions & 4 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,8 @@ Status SetBufferBase(PyArrayObject* arr, const std::shared_ptr<Buffer>& buffer)
}

inline void set_numpy_metadata(int type, const DataType* datatype, PyArray_Descr* out) {
auto metadata = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(out->c_metadata);
auto metadata =
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(out));
if (type == NPY_DATETIME) {
if (datatype->id() == Type::TIMESTAMP) {
const auto& timestamp_type = checked_cast<const TimestampType&>(*datatype);
Expand All @@ -276,7 +277,7 @@ Status PyArray_NewFromPool(int nd, npy_intp* dims, PyArray_Descr* descr, MemoryP
//
// * Track allocations
// * Get better performance through custom allocators
int64_t total_size = descr->elsize;
int64_t total_size = PyDataType_ELSIZE(descr);
for (int i = 0; i < nd; ++i) {
total_size *= dims[i];
}
Expand Down Expand Up @@ -537,8 +538,9 @@ class PandasWriter {

void SetDatetimeUnit(NPY_DATETIMEUNIT unit) {
PyAcquireGIL lock;
auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
PyArray_DESCR(reinterpret_cast<PyArrayObject*>(block_arr_.obj()))->c_metadata);
auto date_dtype =
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(
PyArray_DESCR(reinterpret_cast<PyArrayObject*>(block_arr_.obj()))));
date_dtype->meta.base = unit;
}

Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/src/arrow/python/numpy_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
auto ptr = reinterpret_cast<uint8_t*>(PyArray_DATA(ndarray));
data_ = const_cast<const uint8_t*>(ptr);
size_ = PyArray_SIZE(ndarray) * PyArray_DESCR(ndarray)->elsize;
size_ = PyArray_NBYTES(ndarray);
capacity_ = size_;
is_mutable_ = !!(PyArray_FLAGS(ndarray) & NPY_ARRAY_WRITEABLE);
}
Expand Down Expand Up @@ -150,7 +150,7 @@ Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) {
TO_ARROW_TYPE_CASE(UNICODE, utf8);
case NPY_DATETIME: {
auto date_dtype =
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(descr));
switch (date_dtype->meta.base) {
case NPY_FR_s:
return timestamp(TimeUnit::SECOND);
Expand All @@ -170,7 +170,7 @@ Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) {
} break;
case NPY_TIMEDELTA: {
auto timedelta_dtype =
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(descr));
switch (timedelta_dtype->meta.base) {
case NPY_FR_s:
return duration(TimeUnit::SECOND);
Expand Down
7 changes: 7 additions & 0 deletions python/pyarrow/src/arrow/python/numpy_interop.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,13 @@
#define NPY_INT32_IS_INT 0
#endif

// Backported NumPy 2 API (can be removed if numpy 2 is required)
#if NPY_ABI_VERSION < 0x02000000
#define PyDataType_ELSIZE(descr) ((descr)->elsize)
#define PyDataType_C_METADATA(descr) ((descr)->c_metadata)
#define PyDataType_FIELDS(descr) ((descr)->fields)
#endif
Comment on lines +71 to +75
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use the following until NumPy's Windows nightly wheel build is fixed?

Suggested change
#if NPY_ABI_VERSION < 0x02000000
#define PyDataType_ELSIZE(descr) ((descr)->elsize)
#define PyDataType_C_METADATA(descr) ((descr)->c_metadata)
#define PyDataType_FIELDS(descr) ((descr)->fields)
#endif
#ifndef PyDataType_ELSIZE
#define PyDataType_ELSIZE(descr) ((descr)->elsize)
#endif
#ifndef PyDataType_C_METADATA
#define PyDataType_C_METADATA(descr) ((descr)->c_metadata)
#endif
#ifndef PyDataType_FIELDS
#define PyDataType_FIELDS(descr) ((descr)->fields)
#endif

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm, I guess it bites us that we don't have more fine-grained patch versions :(. Unfortunately, in NumPy 2 those accessors are defined as static inline functions rather than macros, so an #ifndef check cannot detect them. You could put #ifndef _PyArray_DescrNumPy2, but it would be a temporary workaround :(.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, too late, that doesn't make sense either... It is a struct after all. I don't think there is a way to distinguish an older nightly :(.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, sorry. I should have checked how these APIs are implemented in NumPy.


namespace arrow {
namespace py {

Expand Down
21 changes: 12 additions & 9 deletions python/pyarrow/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ class NumPyConverter {
mask_ = reinterpret_cast<PyArrayObject*>(mo);
}
length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
itemsize_ = static_cast<int>(PyArray_DESCR(arr_)->elsize);
itemsize_ = static_cast<int64_t>(PyArray_ITEMSIZE(arr_));
stride_ = static_cast<int64_t>(PyArray_STRIDES(arr_)[0]);
}

Expand Down Expand Up @@ -296,7 +296,7 @@ class NumPyConverter {
PyArrayObject* mask_;
int64_t length_;
int64_t stride_;
int itemsize_;
int64_t itemsize_;

bool from_pandas_;
compute::CastOptions cast_options_;
Expand Down Expand Up @@ -478,7 +478,8 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d

RETURN_NOT_OK(PrepareInputData<Date32Type>(data));

auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
auto date_dtype =
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
if (dtype_->type_num == NPY_DATETIME) {
// If we have inbound datetime64[D] data, this needs to be downcasted
// separately here from int64_t to int32_t, because this data is not
Expand Down Expand Up @@ -514,7 +515,8 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d

RETURN_NOT_OK(PrepareInputData<Date64Type>(data));

auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
auto date_dtype =
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
if (dtype_->type_num == NPY_DATETIME) {
// If we have inbound datetime64[D] data, this needs to be downcasted
// separately here from int64_t to int32_t, because this data is not
Expand Down Expand Up @@ -628,11 +630,11 @@ namespace {
// NumPy unicode is UCS4/UTF32 always
constexpr int kNumPyUnicodeSize = 4;

Status AppendUTF32(const char* data, int itemsize, int byteorder,
Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,
::arrow::internal::ChunkedStringBuilder* builder) {
// The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
// so we need to detect that here to truncate if necessary. Yep.
int actual_length = 0;
Py_ssize_t actual_length = 0;
for (; actual_length < itemsize / kNumPyUnicodeSize; ++actual_length) {
const char* code_point = data + actual_length * kNumPyUnicodeSize;
if ((*code_point == '\0') && (*(code_point + 1) == '\0') &&
Expand Down Expand Up @@ -705,7 +707,7 @@ Status NumPyConverter::Visit(const StringType& type) {
auto AppendNonNullValue = [&](const uint8_t* data) {
if (is_binary_type) {
if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
return builder.Append(data, itemsize_);
return builder.Append(data, static_cast<int32_t>(itemsize_));
} else {
return Status::Invalid("Encountered non-UTF8 binary value: ",
HexEncode(data, itemsize_));
Expand Down Expand Up @@ -750,12 +752,13 @@ Status NumPyConverter::Visit(const StructType& type) {
PyAcquireGIL gil_lock;

// Create converters for each struct type field
if (dtype_->fields == NULL || !PyDict_Check(dtype_->fields)) {
if (PyDataType_FIELDS(dtype_) == NULL || !PyDict_Check(PyDataType_FIELDS(dtype_))) {
return Status::TypeError("Expected struct array");
}

for (auto field : type.fields()) {
PyObject* tup = PyDict_GetItemString(dtype_->fields, field->name().c_str());
PyObject* tup =
PyDict_GetItemString(PyDataType_FIELDS(dtype_), field->name().c_str());
if (tup == NULL) {
return Status::Invalid("Missing field '", field->name(), "' in struct array");
}
Expand Down