Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,4 @@
list_, struct, field,
DataType, Field, Schema, schema)

from pyarrow.table import (Column, RecordBatch, dataframe_from_batches, Table,
from_pandas_dataframe)
from pyarrow.table import Column, RecordBatch, Table, from_pandas_dataframe
94 changes: 48 additions & 46 deletions python/pyarrow/table.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -415,52 +415,6 @@ cdef class RecordBatch:
return result


def dataframe_from_batches(batches):
"""
Convert a list of Arrow RecordBatches to a pandas.DataFrame

Parameters
----------

batches: list of RecordBatch
RecordBatch list to be converted, schemas must be equal

Raises
------
ArrowException
If the batches do not all share an equal schema.
"""

cdef:
vector[shared_ptr[CArray]] c_array_chunks
vector[shared_ptr[CColumn]] c_columns
shared_ptr[CTable] c_table
Array arr
Schema schema

# NOTE(review): the `pd` alias is never referenced in this function;
# the pandas conversion happens inside table.to_pandas() below.
import pandas as pd

schema = batches[0].schema

# check schemas are equal
if any((not schema.equals(other.schema) for other in batches[1:])):
raise ArrowException("Error converting list of RecordBatches to "
"DataFrame, not all schemas are equal")

cdef int K = batches[0].num_columns

# create chunked columns from the batches: output column i is the
# concatenation (as chunks) of column i from every batch, in order
c_columns.resize(K)
for i in range(K):
for batch in batches:
arr = batch[i]
c_array_chunks.push_back(arr.sp_array)
c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i),
c_array_chunks))
# reuse the chunk vector for the next column
c_array_chunks.clear()

# create a Table from columns and convert to DataFrame
c_table.reset(new CTable('', schema.sp_schema, c_columns))
table = Table()
table.init(c_table)
return table.to_pandas()


cdef class Table:
"""
A collection of top-level named, equal length Arrow arrays.
Expand Down Expand Up @@ -567,6 +521,54 @@ cdef class Table:

return result

@staticmethod
def from_batches(batches):
    """
    Construct a Table from a list of Arrow RecordBatches.

    Parameters
    ----------
    batches : list of RecordBatch
        RecordBatch list to be converted; all schemas must be equal.

    Returns
    -------
    Table

    Raises
    ------
    ValueError
        If *batches* is empty.
    ArrowException
        If the batches do not all share an equal schema.
    """
    cdef:
        vector[shared_ptr[CArray]] c_array_chunks
        vector[shared_ptr[CColumn]] c_columns
        shared_ptr[CTable] c_table
        Array arr
        Schema schema

    # Guard the batches[0] accesses below with an explicit, descriptive
    # error instead of a bare IndexError.
    if not batches:
        raise ValueError("Must pass at least one RecordBatch")

    schema = batches[0].schema

    # All batches must share one schema so their columns can be chunked
    # together column-by-column.
    for other in batches[1:]:
        if not schema.equals(other.schema):
            raise ArrowException("Error converting list of RecordBatches "
                                 "to Table, not all schemas are equal: "
                                 "{%s} != {%s}"
                                 % (str(schema), str(other.schema)))

    cdef int K = batches[0].num_columns

    # Output column i is the concatenation (as chunks) of column i from
    # every batch, in order.
    c_columns.resize(K)
    for i in range(K):
        for batch in batches:
            arr = batch[i]
            c_array_chunks.push_back(arr.sp_array)
        c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i),
                                       c_array_chunks))
        # reuse the chunk vector for the next column
        c_array_chunks.clear()

    # Assemble the C++ table and wrap it in a Python-level Table.
    c_table.reset(new CTable('', schema.sp_schema, c_columns))
    table = Table()
    table.init(c_table)
    return table

def to_pandas(self):
"""
Convert the arrow::Table to a pandas DataFrame
Expand Down
5 changes: 3 additions & 2 deletions python/pyarrow/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def test_recordbatchlist_to_pandas():
batch1 = pa.RecordBatch.from_pandas(data1)
batch2 = pa.RecordBatch.from_pandas(data2)

result = pa.dataframe_from_batches([batch1, batch2])
table = pa.Table.from_batches([batch1, batch2])
result = table.to_pandas()
data = pd.concat([data1, data2], ignore_index=True)
assert_frame_equal(data, result)

Expand All @@ -82,7 +83,7 @@ def test_recordbatchlist_schema_equals():
batch2 = pa.RecordBatch.from_pandas(data2)

with pytest.raises(pa.ArrowException):
pa.dataframe_from_batches([batch1, batch2])
pa.Table.from_batches([batch1, batch2])


def test_table_basics():
Expand Down