-
Notifications
You must be signed in to change notification settings - Fork 63
Introduce Polars
for dumping and loading data
#457
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 6 commits
45623d1
c62b9b0
4548266
665489a
4ab9c35
c55c3ab
94e9fc8
1a344e1
d70f285
8a9063f
5395556
33fb5ef
2a88189
79839e1
b200fd9
14ce3ab
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,8 +11,6 @@ | |
import luigi.contrib.s3 | ||
import luigi.format | ||
import numpy as np | ||
import pandas as pd | ||
import pandas.errors | ||
from luigi.format import TextFormat | ||
|
||
from gokart.object_storage import ObjectStorage | ||
|
@@ -21,6 +19,16 @@ | |
logger = getLogger(__name__) | ||
|
||
|
||
# Backend selection: prefer polars when it is importable, otherwise fall
# back to pandas.  DATAFRAME_FRAMEWORK records the choice so the processor
# aliases at the bottom of this module can bind the matching implementations.
try:
    import polars as pl

    DATAFRAME_FRAMEWORK = 'polars'
except ImportError:
    import pandas as pd

    DATAFRAME_FRAMEWORK = 'pandas'
|
||
|
||
class FileProcessor: | ||
@abstractmethod | ||
def format(self): | ||
|
@@ -131,6 +139,24 @@ def __init__(self, sep=',', encoding: str = 'utf-8'): | |
def format(self): | ||
return TextFormat(encoding=self._encoding) | ||
|
||
# Abstract hooks: the concrete CSV behavior lives in the framework-specific
# subclass (PolarsCsvFileProcessor / PandasCsvFileProcessor) bound below.
def load(self, file): ...

def dump(self, obj, file): ...
|
||
|
||
class PolarsCsvFileProcessor(CsvFileProcessor):
    """CSV load/dump backed by polars, honoring the configured separator and encoding."""

    def load(self, file):
        # An empty file makes polars raise NoDataError; return an empty frame
        # instead, mirroring the pandas processor's empty-data handling.
        try:
            return pl.read_csv(file, separator=self._sep, encoding=self._encoding)
        except pl.exceptions.NoDataError:
            return pl.DataFrame()

    def dump(self, obj, file):
        assert isinstance(obj, (pl.DataFrame, pl.Series)), f'requires pl.DataFrame or pl.Series, but {type(obj)} is passed.'
        obj.write_csv(file, separator=self._sep, include_header=True)
|
||
|
||
class PandasCsvFileProcessor(CsvFileProcessor): | ||
def load(self, file): | ||
try: | ||
return pd.read_csv(file, sep=self._sep, encoding=self._encoding) | ||
|
@@ -164,6 +190,34 @@ def __init__(self, orient: str | None = None): | |
def format(self): | ||
return luigi.format.Nop | ||
|
||
# Abstract hooks: the concrete JSON behavior lives in the framework-specific
# subclass (PolarsJsonFileProcessor / PandasJsonFileProcessor) bound below.
def load(self, file): ...

def dump(self, obj, file): ...
|
||
|
||
class PolarsJsonFileProcessor(JsonFileProcessor):
    """JSON / NDJSON load/dump backed by polars."""

    def load(self, file):
        # orient == 'records' selects newline-delimited JSON; any other orient
        # is treated as a single JSON document.  A ComputeError (e.g. empty or
        # malformed input) yields an empty frame rather than propagating.
        try:
            if self._orient == 'records':
                return pl.read_ndjson(file)
            return pl.read_json(file)
        except pl.exceptions.ComputeError:
            return pl.DataFrame()

    def dump(self, obj, file):
        # NOTE(review): the assert admits pl.Series, but Series does not expose
        # write_json/write_ndjson in current polars — confirm a Series actually
        # round-trips here, or tighten the accepted types.
        assert isinstance(obj, (pl.DataFrame, pl.Series, dict)), (
            f'requires pl.DataFrame or pl.Series or dict, but {type(obj)} is passed.'
        )
        frame = pl.from_dict(obj) if isinstance(obj, dict) else obj

        if self._orient == 'records':
            frame.write_ndjson(file)
        else:
            frame.write_json(file)
|
||
|
||
class PandasJsonFileProcessor(JsonFileProcessor): | ||
def load(self, file): | ||
try: | ||
return pd.read_json(file, orient=self._orient, lines=True if self._orient == 'records' else False) | ||
|
@@ -215,11 +269,27 @@ def __init__(self, engine='pyarrow', compression=None): | |
def format(self): | ||
return luigi.format.Nop | ||
|
||
# Abstract hooks: the concrete Parquet behavior lives in the framework-specific
# subclass (PolarsParquetFileProcessor / PandasParquetFileProcessor) bound below.
def load(self, file): ...

def dump(self, obj, file): ...
|
||
|
||
class PolarsParquetFileProcessor(ParquetFileProcessor):
    """Parquet load/dump backed by polars."""

    def load(self, file):
        # A buffered reader backs a local file, which polars can open by path;
        # otherwise (e.g. a remote stream) materialize the payload in memory.
        if ObjectStorage.is_buffered_reader(file):
            return pl.read_parquet(file.name)
        return pl.read_parquet(BytesIO(file.read()))

    def dump(self, obj, file):
        assert isinstance(obj, pl.DataFrame), f'requires pl.DataFrame, but {type(obj)} is passed.'
        # polars expects the literal string 'uncompressed' instead of None.
        obj.write_parquet(
            file,
            use_pyarrow=self._engine == 'pyarrow',
            compression='uncompressed' if self._compression is None else self._compression,
        )
|
||
|
||
class PandasParquetFileProcessor(ParquetFileProcessor): | ||
def load(self, file): | ||
# FIXME(mamo3gr): enable streaming (chunked) read with S3. | ||
# pandas.read_parquet accepts file-like object | ||
# but file (luigi.contrib.s3.ReadableS3File) should have 'tell' method, | ||
# which is needed for pandas to read a file in chunks. | ||
if ObjectStorage.is_buffered_reader(file): | ||
return pd.read_parquet(file.name) | ||
else: | ||
|
@@ -240,6 +310,27 @@ def __init__(self, store_index_in_feather: bool): | |
def format(self): | ||
return luigi.format.Nop | ||
|
||
# Abstract hooks: the concrete Feather behavior lives in the framework-specific
# subclass (PolarsFeatherFileProcessor / PandasFeatherFileProcessor) bound below.
def load(self, file): ...

def dump(self, obj, file): ...
|
||
|
||
class PolarsFeatherFileProcessor(FeatherFileProcessor):
    """Feather (Arrow IPC) load/dump backed by polars."""

    def load(self, file):
        # polars DataFrames carry no index, so store_index_in_feather is
        # deliberately ignored here.
        # TODO: fix ignoring the store_index_in_feather variable.
        # It is ignored to avoid a breaking change of FeatherFileProcessor's
        # default behavior.
        if ObjectStorage.is_buffered_reader(file):
            return pl.read_ipc(file.name)
        return pl.read_ipc(BytesIO(file.read()))

    def dump(self, obj, file):
        assert isinstance(obj, pl.DataFrame), f'requires pl.DataFrame, but {type(obj)} is passed.'
        # NOTE(review): this writes via file.name, assuming `file` has a backing
        # path — unlike load() there is no BytesIO fallback; confirm dump is
        # never handed a path-less stream (e.g. an S3 writer).
        obj.write_ipc(file.name)
|
||
|
||
class PandasFeatherFileProcessor(FeatherFileProcessor): | ||
def load(self, file): | ||
# FIXME(mamo3gr): enable streaming (chunked) read with S3. | ||
# pandas.read_feather accepts file-like object | ||
|
@@ -281,6 +372,18 @@ def dump(self, obj, file): | |
dump_obj.to_feather(file.name) | ||
|
||
|
||
# Bind the public processor names to the backend chosen at import time
# (see the polars/pandas try-import near the top of this module).
_POLARS_SELECTED = DATAFRAME_FRAMEWORK == 'polars'
CsvFileProcessor = PolarsCsvFileProcessor if _POLARS_SELECTED else PandasCsvFileProcessor  # type: ignore
JsonFileProcessor = PolarsJsonFileProcessor if _POLARS_SELECTED else PandasJsonFileProcessor  # type: ignore
ParquetFileProcessor = PolarsParquetFileProcessor if _POLARS_SELECTED else PandasParquetFileProcessor  # type: ignore
FeatherFileProcessor = PolarsFeatherFileProcessor if _POLARS_SELECTED else PandasFeatherFileProcessor  # type: ignore
|
||
|
||
def make_file_processor(file_path: str, store_index_in_feather: bool) -> FileProcessor: | ||
extension2processor = { | ||
'.txt': TextFileProcessor(), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a CI test run with the `polars` extra installed.