Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7486594
add note to find spot later
Meghansaha Jan 8, 2026
db2b70a
add in polars lib check; prep for dict handling and schema creation
Meghansaha Jan 25, 2026
3f37b61
add schema
Meghansaha Feb 1, 2026
fc2b745
proto df
Meghansaha Feb 4, 2026
698fc48
Created draft df to be returned
Meghansaha Feb 10, 2026
2d7e9b8
Merge remote-tracking branch 'upstream/main' into add-get_dataframe
Meghansaha Feb 10, 2026
cf66d3f
Merge branch 'posit-dev:main' into add-get_dataframe
Meghansaha Feb 20, 2026
a958ded
first draft of polars table complete.
Meghansaha Feb 25, 2026
44f41c7
pandas done
Meghansaha Mar 7, 2026
b3099a8
add duckdb/ibis workflow
Meghansaha Mar 11, 2026
ee5264f
Merge branch 'posit-dev:main' into add-get_dataframe
Meghansaha Mar 11, 2026
c53e3ed
fix ruff errors
Meghansaha Mar 11, 2026
c3b3fa7
Merge branch 'add-get_dataframe' of https://github.com/Meghansaha/poi…
Meghansaha Mar 11, 2026
9f47df9
reformat w/ ruff
Meghansaha Mar 11, 2026
3b5f38e
pull in updates from main
Meghansaha Mar 11, 2026
e3210cb
update documentation, add test placeholder
Meghansaha Mar 21, 2026
47237c1
add tests for `get_dataframe`; need to look at ibis workflow again
Meghansaha Mar 25, 2026
6ce63b3
Merge branch 'posit-dev:main' into add-get_dataframe
Meghansaha Mar 25, 2026
eefb6ce
debugging
Meghansaha Apr 2, 2026
68aa301
Merge branch 'add-get_dataframe' of https://github.com/Meghansaha/poi…
Meghansaha Apr 2, 2026
bd970a3
typo fix
Meghansaha Apr 2, 2026
583e755
cleanup
Meghansaha Apr 7, 2026
17cb5d7
Merge branch 'posit-dev:main' into add-get_dataframe
Meghansaha Apr 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
296 changes: 296 additions & 0 deletions pointblank/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@
"missing_vals_tbl",
"get_action_metadata",
"get_column_count",
"get_dataframe",
"get_data_path",
"get_row_count",
"get_validation_summary",
Expand Down Expand Up @@ -4150,6 +4151,8 @@ def connect_to_table(connection_string: str) -> Any:
"Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
)

import os

import ibis

# Check if connection string includes table specification
Expand All @@ -4165,6 +4168,8 @@ def connect_to_table(connection_string: str) -> Any:
available_tables = []

conn.disconnect()
conn.close()
os.unlink(base_connection)

# Create helpful error message
if available_tables:
Expand Down Expand Up @@ -17805,6 +17810,297 @@ def get_step_report(

return step_report

def get_dataframe(self, tbl_type: Literal["polars", "pandas", "duckdb"] = "polars"):
    """
    Validation results as a dataframe.

    The `get_dataframe()` method returns a dataframe that represents the validation
    report. This dataframe provides a summary of the validation results, including the
    validation steps, the number of test units, the number of failing test units, and
    the fraction of failing test units. This can be particularly helpful for logging
    purposes and enables writing validation summaries to CSVs and other on-disk formats.

    Parameters
    ----------
    tbl_type :
        The output backend for the dataframe. The named options are `"polars"`,
        `"pandas"`, and `"duckdb"`. Default is `"polars"`.

    Returns
    -------
    A Polars DataFrame, a Pandas DataFrame, or an Ibis table for a DuckDB database,
    depending on the value of `tbl_type=`.

    Raises
    ------
    ValueError
        If `tbl_type=` is not one of the supported backend names.
    ImportError
        If the library required by the chosen backend is not installed.

    Supported DataFrame Types
    -------------------------
    The `tbl_type=` parameter can be set to one of the following:

    - `"polars"`: A Polars DataFrame.
    - `"pandas"`: A Pandas DataFrame.
    - `"duckdb"`: An Ibis table for a DuckDB database.

    Examples
    --------

    ```{python}
    import pointblank as pb

    # Create a validation
    validation = (
        pb.Validate(data=pb.load_dataset("small_table", tbl_type = "polars"), label="My validation")
        .col_vals_gt(columns="d", value=100)
        .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
        .interrogate()
    )

    # Get a dataframe of the validation summary results
    df_validation = validation.get_dataframe()

    ```

    """
    # Validate `tbl_type` up front so the caller gets a clear message before any
    # backend-specific imports or work happen
    if tbl_type not in ["polars", "pandas", "duckdb"]:
        raise ValueError(
            f"The DataFrame type `{tbl_type}` is not valid. Choose one of the following:\n"
            "- `polars`\n"
            "- `pandas`\n"
            "- `duckdb`"
        )

    # Grab the summary data from validation info helper function
    report_original = _validation_info_as_dict(self.validation_info)

    # Pop the extracts off if present; they are not part of the tabular summary
    report_original.pop("extract", None)

    # Mapping of internal column names to user-facing report names; the key order
    # also fixes the column order of the resulting dataframe
    names_dict = {
        "active": "active",
        "i": "step_number",
        "assertion_type": "step_description",
        "column": "columns",
        "values": "values",
        "pre": "original_pre",
        "segments": "original_segments",
        "eval_error": "step_evaluated",
        "n": "units",
        "all_passed": "all_units_passed",
        "n_passed": "pass_n",
        "f_passed": "pass_pct",
        "n_failed": "failed_n",
        "f_failed": "failed_pct",
        "warning": "warning",
        "error": "error",
        "critical": "critical",
        "brief": "input_brief",
        "autobrief": "autobrief",
    }

    # Keep only the columns the report knows how to present
    final_report = {
        key: report_original[key] for key in names_dict.keys() if key in report_original
    }

    # Check for polars, raise if not installed
    if tbl_type == "polars":
        if not _is_lib_present(lib_name="polars"):
            raise ImportError(
                "The Polars library is not installed but is required when specifying "
                '`tbl_type="polars"`.'
            )

        import polars as pl

        # Explicit schema so the report has stable dtypes even when sparsely populated
        pl_schema = pl.Schema(
            {
                "active": pl.Boolean,
                "i": pl.Int64,
                "assertion_type": pl.String,
                "column": pl.String,
                "values": pl.Object,
                "pre": pl.Object,
                "segments": pl.String,
                "eval_error": pl.Boolean,
                "n": pl.Int64,
                "all_passed": pl.Boolean,
                "n_passed": pl.Int64,
                "f_passed": pl.Float64,
                "n_failed": pl.Int64,
                "f_failed": pl.Float64,
                "warning": pl.Boolean,
                "error": pl.Boolean,
                "critical": pl.Boolean,
                "brief": pl.String,
                "autobrief": pl.String,  # Default brief if none found
            }
        )

        df_validation_results = (
            pl.DataFrame(data=final_report, schema=pl_schema)
            .rename(names_dict)
            .with_columns(
                # Prefer the user-supplied brief, falling back to the automatic one
                brief=pl.coalesce(pl.col("input_brief"), pl.col("autobrief")),
                # Flag whether preprocessing/segmenting was applied to the step
                preprocessed=pl.col("original_pre").is_not_null(),
                segmented=pl.col("original_segments").is_not_null(),
                # Extract pattern from values if it's a dict, otherwise keep as-is
                values=pl.col("values").map_elements(
                    lambda x: x.get("pattern") if isinstance(x, dict) and "pattern" in x else x,
                    return_dtype=pl.Object,
                ),
            )
            .with_columns(
                [
                    # Inactive steps show "-" in the result columns (this coerces
                    # the affected columns to string where needed)
                    pl.when(~pl.col("active"))
                    .then(pl.lit("-"))
                    .otherwise(pl.col(col))
                    .alias(col)
                    for col in [
                        "step_evaluated",
                        "units",
                        "all_units_passed",
                        "pass_n",
                        "pass_pct",
                        "failed_n",
                        "failed_pct",
                        "warning",
                        "error",
                        "critical",
                    ]
                ]
            )
            .drop(["input_brief", "autobrief", "original_pre", "original_segments"])
        )

        return df_validation_results

    if tbl_type == "pandas":
        if not _is_lib_present(lib_name="pandas"):
            raise ImportError(
                "The Pandas library is not installed but is required when specifying "
                '`tbl_type="pandas"`.'
            )

        import pandas as pd

        def transform_validation_results(df):
            # Coalesce: prefer the user-supplied brief over the automatic one
            df = df.assign(brief=df["input_brief"].fillna(df["autobrief"]))

            # Boolean flags based on null checks of the original pre/segment values
            df = df.assign(
                preprocessed=df["original_pre"].notna(),
                segmented=df["original_segments"].notna(),
            )

            # Extract pattern from dict values, otherwise keep as-is
            df = df.assign(
                values=df["values"].apply(
                    lambda x: x.get("pattern") if isinstance(x, dict) and "pattern" in x else x
                )
            )

            # Inactive steps show "-" in the result columns
            conditional_cols = [
                "step_evaluated",
                "units",
                "all_units_passed",
                "pass_n",
                "pass_pct",
                "failed_n",
                "failed_pct",
                "warning",
                "error",
                "critical",
            ]

            for col in conditional_cols:
                df[col] = df[col].where(df["active"], "-")

            # Drop the intermediates now folded into `brief` and the boolean flags
            df = df.drop(
                columns=["input_brief", "autobrief", "original_pre", "original_segments"]
            )

            return df

        df_validation_results = (
            pd.DataFrame(data=final_report)
            .rename(columns=names_dict)
            .pipe(transform_validation_results)
        )

        return df_validation_results

    if tbl_type == "duckdb":
        if not _is_lib_present(lib_name="ibis"):
            raise ImportError(
                "The Ibis library is not installed but is required when specifying "
                '`tbl_type="duckdb"`.'
            )

        import ibis
        import ibis.expr.datatypes as dt

        # Schema mirroring the Polars one; `values`/`pre` are JSON since they can
        # hold arbitrary structures
        ibis_schema = {
            "active": dt.Boolean(),
            "i": dt.Int64(),
            "assertion_type": dt.String(),
            "column": dt.String(),
            "values": dt.json(),
            "pre": dt.json(),
            "segments": dt.String(),
            "eval_error": dt.Boolean(),
            "n": dt.Int64(),
            "all_passed": dt.Boolean(),
            "n_passed": dt.Int64(),
            "f_passed": dt.Float64(),
            "n_failed": dt.Int64(),
            "f_failed": dt.Float64(),
            "warning": dt.Boolean(),
            "error": dt.Boolean(),
            "critical": dt.Boolean(),
            "brief": dt.String(),
            "autobrief": dt.String(),
        }

        # Pulling out clean regex pattern if needed
        final_report["values"] = [
            values.get("pattern")
            if isinstance(values, dict) and "pattern" in values
            else values
            for values in final_report["values"]
        ]

        # Ibis `rename` expects {new_name: old_name}, hence the inverted mapping
        report_table = ibis.memtable(final_report, schema=ibis_schema).rename(
            {v: k for k, v in names_dict.items()}
        )

        # NOTE(review): unlike the Polars/Pandas branches this list also includes
        # "step_number" — confirm whether that cross-backend difference is intentional
        conditional_cols = [
            "step_number",
            "step_evaluated",
            "units",
            "all_units_passed",
            "pass_n",
            "pass_pct",
            "failed_n",
            "failed_pct",
            "warning",
            "error",
            "critical",
        ]

        df_validation_results = report_table.mutate(
            brief=ibis.coalesce(report_table.input_brief, report_table.autobrief),
            preprocessed=report_table.original_pre.notnull(),
            segmented=report_table.original_segments.notnull(),
            **{
                # Inactive steps show "-"; active values are cast to string so the
                # column has a uniform dtype
                col: ibis.ifelse(
                    ~report_table.active,
                    ibis.literal("-"),
                    report_table[col].cast("string"),
                )
                for col in conditional_cols
            },
        ).drop("input_brief", "autobrief", "original_pre", "original_segments")

        return df_validation_results

def _add_validation(self, validation_info):
"""
Add a validation to the list of validations.
Expand Down
44 changes: 43 additions & 1 deletion tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class StrEnum(str, Enum):

## If we specifically disable tests in pytest set the availability to False
if os.environ.get("SKIP_PYSPARK_TESTS", "").lower() in ("true", "1", "yes"):
    # Fix: restore the underscore — `PYSPARKAVAILABLE` was a typo that left the
    # real `PYSPARK_AVAILABLE` flag untouched when the env var is set
    PYSPARK_AVAILABLE = False
SQLITE_AVAILABLE = True
if os.environ.get("SKIP_SQLITE_TESTS", "").lower() in ("true", "1", "yes"):
    SQLITE_AVAILABLE = False
Expand Down Expand Up @@ -13899,6 +13899,47 @@ def test_get_step_report_schema_checks(schema) -> None:
assert isinstance(validation.get_step_report(i=1), GT.GT)


def test_get_dataframe_wrong_tbl_type_messaging():
    # An unsupported `tbl_type=` value should raise a descriptive ValueError
    data = pl.DataFrame({"name": ["Monica", "Erica", "Rita", "Tina"], "mambo_no": [2, 3, 4, 5]})

    validation = Validate(data=data).col_vals_gt(columns="mambo_no", value=5).interrogate()

    expected_msg = "The DataFrame type `polar` is not valid. Choose one of"
    with pytest.raises(ValueError, match=expected_msg):
        validation.get_dataframe("polar")


@pytest.mark.parametrize(
    "library, tbl_type", [("Polars", "polars"), ("Pandas", "pandas"), ("Ibis", "duckdb")]
)
def test_get_dataframe_missing_libraries(library, tbl_type):
    # Each backend should raise ImportError when its library appears uninstalled
    validation = Validate(data="small_table")

    # Simulate the backend library being absent
    with patch("pointblank.validate._is_lib_present", return_value=False):
        with pytest.raises(ImportError, match=f"The {library} library is not installed"):
            validation.get_dataframe(tbl_type)


def test_get_dataframe_returns_polars_df():
    # NOTE(review): calls `get_dataframe` without `.interrogate()` — presumably an
    # empty report is acceptable here; confirm
    result = Validate(data="small_table").get_dataframe("polars")
    assert isinstance(result, pl.DataFrame)


def test_get_dataframe_returns_pandas_df():
    # The "pandas" backend should hand back a Pandas DataFrame
    result = Validate(data="small_table").get_dataframe("pandas")
    assert isinstance(result, pd.DataFrame)


def test_get_dataframe_returns_ibis_memtable():
    # The "duckdb" backend should hand back an Ibis table expression
    result = Validate(data="small_table").get_dataframe("duckdb")
    assert isinstance(result, ibis.expr.types.relations.Table)


def get_schema_info(
data_tbl,
schema,
Expand Down Expand Up @@ -19317,6 +19358,7 @@ def test_col_vals_ge_timezone_datetime_duckdb() -> None:

finally:
conn.close()
os.unlink(temp_db_path)


@pytest.mark.xfail(reason="Mixed timezone comparisons may not work correctly yet")
Expand Down
Loading