Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,4 @@ repos:
hooks:
- id: codespell
additional_dependencies:
- tomli
- tomli
10 changes: 7 additions & 3 deletions pandera/io/pandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from functools import partial
from pathlib import Path
from typing import Dict, Optional, Union

from frictionless.fields import AnyField

Check warning on line 9 in pandera/io/pandas_io.py

View check run for this annotation

Codecov / codecov/patch

pandera/io/pandas_io.py#L9

Added line #L9 was not covered by tests
import pandas as pd

import pandera.errors
Expand Down Expand Up @@ -642,11 +642,13 @@
duplicates, no missing values etc.
"""

def __init__(self, field, primary_keys) -> None:
def __init__(self, field: AnyField, primary_keys: list[str]) -> None:

Check warning on line 645 in pandera/io/pandas_io.py

View check run for this annotation

Codecov / codecov/patch

pandera/io/pandas_io.py#L645

Added line #L645 was not covered by tests
self.constraints = field.constraints or {}
self.primary_keys = primary_keys
self.description = field.description
self.title = field.title

Check warning on line 649 in pandera/io/pandas_io.py

View check run for this annotation

Codecov / codecov/patch

pandera/io/pandas_io.py#L648-L649

Added lines #L648 - L649 were not covered by tests
self.name = field.name
self.type = field.get("type", "string")
self.type = field.to_dict().get("type", "string")

Check warning on line 651 in pandera/io/pandas_io.py

View check run for this annotation

Codecov / codecov/patch

pandera/io/pandas_io.py#L651

Added line #L651 was not covered by tests

@property
def dtype(self) -> str:
Expand Down Expand Up @@ -791,6 +793,8 @@
"required": self.required,
"name": self.name,
"regex": self.regex,
"description": self.description,
"title": self.title,
}


Expand Down
81 changes: 17 additions & 64 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,10 @@ version_file = "pandera/_version.py"
[project]
Copy link
Collaborator

@cosmicBboy cosmicBboy May 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please revert the formatting changes to this file; they are not relevant to the PR. We will also need to document that pandera only supports frictionless >= 5.18.1 after these changes.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Sorry, my VS Code used ruff as the autoformatter.

name = "pandera"
dynamic = ["version"]
authors = [
{name = "Niels Bantilan", email = "[email protected]"},
]
authors = [{ name = "Niels Bantilan", email = "[email protected]" }]
description = "A light-weight and flexible data validation and testing tool for statistical data objects."
readme = "README.md"
license = {file = "LICENSE.txt"}
license = { file = "LICENSE.txt" }
requires-python = ">=3.9"
keywords = ["pandas", "validation", "data-structures"]
classifiers = [
Expand Down Expand Up @@ -43,55 +41,26 @@ Documentation = "https://pandera.readthedocs.io"
Homepage = "https://github.com/pandera-dev/pandera"

[project.optional-dependencies]
pandas = [
"numpy >= 1.24.4",
"pandas >= 2.1.1",
]
strategies = [
"hypothesis >= 6.92.7",
]
hypotheses = [
"scipy",
]
io = [
"pyyaml >= 5.1",
"black",
"frictionless <= 4.40.8",
]
pandas = ["numpy >= 1.24.4", "pandas >= 2.1.1"]
strategies = ["hypothesis >= 6.92.7"]
hypotheses = ["scipy"]
io = ["pyyaml >= 5.1", "black", "frictionless >= 5.18.1"]
mypy = ["pandas-stubs"]
fastapi = ["fastapi"]
geopandas = [
"geopandas",
"shapely",
]
geopandas = ["geopandas", "shapely"]
pyspark = ["pyspark[connect] >= 3.2.0"]
modin = [
"modin",
"ray",
"dask[dataframe]",
"distributed",
]
modin-ray = [
"modin",
"ray",
]
modin-dask = [
"modin",
"dask[dataframe]",
"distributed",
]
dask = [
"dask[dataframe]",
"distributed",
]
modin = ["modin", "ray", "dask[dataframe]", "distributed"]
modin-ray = ["modin", "ray"]
modin-dask = ["modin", "dask[dataframe]", "distributed"]
dask = ["dask[dataframe]", "distributed"]

polars = ["polars >= 0.20.0"]
all = [
"hypothesis >= 6.92.7",
"scipy",
"pyyaml >= 5.1",
"black",
"frictionless <= 4.40.8",
"frictionless >= 5.18.1",
"pyspark[connect] >= 3.2.0",
"modin",
"ray",
Expand Down Expand Up @@ -121,12 +90,7 @@ dev = [
"python-multipart",
"uv",
]
testing = [
"pytest",
"pytest-cov",
"pytest-xdist",
"pytest-asyncio",
]
testing = ["pytest", "pytest-cov", "pytest-xdist", "pytest-asyncio"]
docs = [
"sphinx",
"sphinx-design",
Expand All @@ -147,7 +111,7 @@ docs = [

[tool.setuptools]
packages = ["pandera"]
package-data = {"pandera" = ["py.typed"]}
package-data = { "pandera" = ["py.typed"] }

[tool.pyright]
include = ["pandera", "tests"]
Expand All @@ -159,22 +123,11 @@ log_cli_level = 20

[tool.ruff]
line-length = 120
extend-exclude = [
"setup.py",
".venv",
".nox",
".git",
"asv_bench",
]
extend-exclude = ["setup.py", ".venv", ".nox", ".git", "asv_bench"]

[tool.black]
line-length = 79
target-version = [
'py39',
'py310',
'py311',
'py312',
]
target-version = ['py39', 'py310', 'py311', 'py312']
include = '\.pyi?$'
exclude = '''
(
Expand All @@ -187,4 +140,4 @@ exclude = '''
'''

[tool.codespell]
ignore-words-list = ["notin", "splitted", "fo", "strat"]
ignore-words-list = ["notin", "splitted", "fo", "strat"]
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ typing_extensions
hypothesis >= 6.92.7
pyyaml >= 5.1
typing_inspect >= 0.6.0
frictionless <= 4.40.8
frictionless >= 5.18.1
pyarrow
pydantic
scipy
Expand Down
52 changes: 48 additions & 4 deletions tests/io/test_pandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pandera.typing as pat
from pandera.api.pandas.container import DataFrameSchema
from pandera.engines import pandas_engine
from pandera.io import from_frictionless_schema

try:
from pandera import io
Expand Down Expand Up @@ -1155,6 +1156,7 @@ def datetime_check(pandas_obj, *, stat): ...
maxLength: 80
minLength: 3
name: string_col
type: string
- constraints:
pattern: \\d{3}[A-Z]
name: string_col_2
Expand All @@ -1172,16 +1174,17 @@ def datetime_check(pandas_obj, *, stat): ...
required: true
name: float_col
type: number
- constraints:
name: float_col_2
- name: float_col_2
type: number
- constraints:
minimum: "20201231"
name: date_col
type: date
primaryKey: integer_col
"""
)


FRICTIONLESS_JSON = {
"fields": [
{
Expand Down Expand Up @@ -1432,7 +1435,7 @@ def datetime_check(pandas_obj, *, stat): ...
)
def test_frictionless_schema_parses_correctly(frictionless_schema):
"""Test parsing frictionless schema from yaml and json."""
schema = pandera.io.from_frictionless_schema(frictionless_schema)
schema = from_frictionless_schema(frictionless_schema)

assert str(schema.to_yaml()).strip() == YAML_FROM_FRICTIONLESS.strip()

Expand Down Expand Up @@ -1513,11 +1516,52 @@ def test_frictionless_schema_primary_key(frictionless_schema):
If the primary key is only one field, the unique field should be in the
column level and not the dataframe level.
"""
schema = pandera.io.from_frictionless_schema(frictionless_schema)
schema = from_frictionless_schema(frictionless_schema)
if len(frictionless_schema["primaryKey"]) == 1:
assert schema.columns[frictionless_schema["primaryKey"][0]].unique
assert schema.unique is None
else:
assert schema.unique == frictionless_schema["primaryKey"]
for key in frictionless_schema["primaryKey"]:
assert not schema.columns[key].unique


@pytest.mark.parametrize(
    "frictionless_schema",
    [
        {
            "fields": [
                {
                    "name": "street_id",
                    "type": "string",
                    "description": "Id of the street",
                    "title": "street identifier",
                    # "example" does not exist in pandera, so there is
                    # no need to check it.
                    "example": "45566_4455_4",
                },
                {
                    "name": "street_type",
                    "type": "string",
                    "constraints": {
                        "enum": ["highway", "motorway", "secondary"]
                    },
                },
                {
                    "name": "timestamp",
                    "type": "datetime",
                    "format": "%Y-%m-%d_%H:%M",
                },
                {
                    "name": "count",
                    "type": "integer",
                },
            ],
            "primaryKey": ["street_id", "timestamp"],
        }
    ],
)
def test_frictionless_schema_with_description_and_title(
    frictionless_schema: dict[str, object],
) -> None:
    """Test that a field's description and title reach the pandera column.

    The parametrized schema declares ``description`` and ``title`` only on
    the ``street_id`` field, so that is the column inspected after the
    frictionless schema has been converted.
    """
    schema = from_frictionless_schema(frictionless_schema)
    street_id = schema.columns["street_id"]
    assert street_id.description == "Id of the street"
    assert street_id.title == "street identifier"
Loading