Skip to content

Commit 0e60e0b

Browse files
fix: Pandas parser fails to parse integer-only or boolean-only dataframes (#1683)
- Fixes #1678. Converting the Series returned by `iterrows()` to a dict converts `np.int64` values to Python's native `int` type, which fixes the bug (the same applies to booleans).
- Adds non-regression tests.
- I was also concerned about the lines that follow, especially `if value is np.nan: value = None`, which were untested, so I added a test. It looks like `to_dict()` does not change the behavior of the `np.nan` conversion (see the side note), so I left this code unchanged.
- Primary keys are returned as `int` or `tuple[int]`; there is no `np.int64` there.
- `Timestamp` values are kept unchanged, so the `if isinstance(value, pd.Timestamp):` branch still applies.

## Side note

The behavior of `np.nan` with `df.iterrows()` is quite strange: in a numeric column it is converted to `float("nan")`, whereas in a string column it is kept as `np.nan`. Adding `to_dict()` to the row Series does not change the types.
1 parent 97f42ec commit 0e60e0b

File tree

2 files changed

+39
-7
lines changed

2 files changed

+39
-7
lines changed

frictionless/formats/pandas/__spec__/test_parser.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from decimal import Decimal
33

44
import isodate
5+
import numpy as np
56
import pandas as pd
67
import pytz
78
from dateutil.tz import tzoffset, tzutc
@@ -14,13 +15,44 @@
1415

1516

1617
def test_pandas_parser():
17-
dataframe = pd.DataFrame(data={"id": [1, 2], "name": ["english", "中国人"]})
18+
test_cases = [
19+
{
20+
"name": "Integer type only dataframe, cf issue 1678",
21+
"df_data": {"int": [1]},
22+
"expected_header": ["int"],
23+
"expected_rows": [{"int": 1}],
24+
},
25+
{
26+
"name": "Boolean type only dataframe, cf issue 1678",
27+
"df_data": {"bool": [True]},
28+
"expected_header": ["bool"],
29+
"expected_rows": [{"bool": True}],
30+
},
31+
{
32+
"name": "Mixed types dataframe, chinese characters",
33+
"df_data": {"id": [1, 2], "name": ["english", "中国人"]},
34+
"expected_header": ["id", "name"],
35+
"expected_rows": [
36+
{"id": 1, "name": "english"},
37+
{"id": 2, "name": "中国人"},
38+
],
39+
},
40+
]
41+
for tc in test_cases:
42+
dataframe = pd.DataFrame(data=tc["df_data"])
43+
44+
with TableResource(data=dataframe) as resource:
45+
assert resource.header == tc["expected_header"], tc["name"]
46+
assert resource.read_rows() == tc["expected_rows"], tc["name"]
47+
48+
49+
def test_pandas_parser_with_nan():
50+
dataframe = pd.DataFrame(data={"x": [np.nan]})
51+
1852
with TableResource(data=dataframe) as resource:
19-
assert resource.header == ["id", "name"]
20-
assert resource.read_rows() == [
21-
{"id": 1, "name": "english"},
22-
{"id": 2, "name": "中国人"},
23-
]
53+
test_name = 'np.nan converted to Decimal("NaN")'
54+
row = resource.read_rows()[0]
55+
assert row["x"].is_nan(), test_name
2456

2557

2658
def test_pandas_parser_from_dataframe_with_primary_key_having_datetime():

frictionless/formats/pandas/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def read_cell_stream_create(self):
5353
pk = pk if isinstance(pk, tuple) else [pk] # type: ignore
5454
value = pk[schema.primary_key.index(field.name)] # type: ignore
5555
else:
56-
value = item[field.name]
56+
value = item.to_dict()[field.name]
5757
if value is np.nan:
5858
value = None
5959
elif isinstance(value, pd.Timestamp):

0 commit comments

Comments
 (0)