-
Notifications
You must be signed in to change notification settings - Fork 13
Open
Description
Hi,
An example of filtering a specific schema could be:
from decimal import Decimal
import dataframely as dy
import polars as pl
class InvoiceIdSchema(dy.Schema):
    """Base schema that declares only the ``invoice_id`` key column.

    Schemas that subclass this one inherit ``invoice_id`` as a primary-key
    string column.
    """

    # String identifier of the invoice, flagged as (part of) the primary key.
    invoice_id = dy.String(primary_key=True)
class InvoiceSchema(InvoiceIdSchema):
    """Invoice rows: ``invoice_id`` plus stay dates, receipt time and amount.

    All four columns declared here are non-nullable. Two row-level rules
    enforce the ordering admission <= discharge <= date of receipt.
    """

    admission_date = dy.Date(nullable=False)
    discharge_date = dy.Date(nullable=False)
    received_at = dy.Datetime(nullable=False)
    # Strictly positive amount: zero itself is excluded by ``min_exclusive``.
    amount = dy.Decimal(nullable=False, min_exclusive=Decimal(0))

    @dy.rule()
    def discharge_after_admission(cls) -> pl.Expr:
        """Require the discharge date to be on or after the admission date."""
        discharged = pl.col("discharge_date")
        admitted = pl.col("admission_date")
        return discharged >= admitted

    @dy.rule()
    def received_at_after_discharge(cls) -> pl.Expr:
        """Require the invoice to be received on or after the discharge date.

        ``received_at`` is a datetime, so only its date part is compared.
        """
        received_on = pl.col("received_at").dt.date()
        discharged = pl.col("discharge_date")
        return received_on >= discharged
class DiagnosisSchema(InvoiceIdSchema):
    """Diagnosis rows attached to an invoice.

    ``diagnosis_code`` is flagged as a primary key in addition to the
    inherited ``invoice_id``.
    """

    # Code must match one uppercase letter followed by two to four digits.
    diagnosis_code = dy.String(primary_key=True, regex=r"[A-Z][0-9]{2,4}")
    diagnosis_date = dy.Date(nullable=False)
    is_main = dy.Bool(nullable=False)

    @dy.rule(group_by=["invoice_id"])
    def exactly_one_main_diagnosis(cls) -> pl.Expr:
        """Require exactly one ``is_main`` diagnosis per ``invoice_id`` group."""
        # Summing the boolean column counts the rows where it is true.
        main_count = pl.col("is_main").sum()
        return main_count == 1
class HospitalClaims(dy.Collection):
    """Collection pairing an invoices frame with a diagnoses frame.

    Both member schemas carry ``invoice_id``, which the cross-member checks
    below use as the join key.
    """

    invoices: dy.LazyFrame[InvoiceSchema]
    diagnoses: dy.LazyFrame[DiagnosisSchema]

    @dy.filter(members=["diagnoses"])  # Apply filter only to diagnoses member
    def diagnosis_date_after_admission(self) -> pl.LazyFrame:
        """
        Keep diagnoses whose diagnosis_date is on or after the admission_date.
        admission_date lives on the invoices member, so it is joined onto the
        diagnoses by invoice_id before the comparison.
        Returns: LazyFrame with the diagnoses rows (plus admission_date) that pass
        """
        admissions = self.invoices.select(["invoice_id", "admission_date"])
        with_admission = self.diagnoses.join(
            admissions,
            on="invoice_id",
            how="left",
        )
        return with_admission.filter(
            pl.col("diagnosis_date") >= pl.col("admission_date")
        )

    @dy.rule()  # Collection-level validation across members
    def diagnosis_before_discharge(self) -> pl.LazyFrame:
        """
        Check that every diagnosis date falls on or before the discharge date.
        discharge_date is taken from the invoices member via an inner join
        on invoice_id.
        Returns: LazyFrame with rows that violate this rule (diagnosis_date > discharge_date)
        """
        discharges = self.invoices.select(["invoice_id", "discharge_date"])
        with_discharge = self.diagnoses.join(
            discharges,
            on="invoice_id",
            how="inner",
        )
        return with_discharge.filter(
            pl.col("diagnosis_date") > pl.col("discharge_date")
        )
Ability to apply a collection filter to only specific member schemas.
AndreasAlbertQC
Metadata
Metadata
Assignees
Labels
No labels