Skip to content

Feature request - Support for filtering specific schemas for collection filter #239

@ereb2impact

Description

@ereb2impact

Hi,

An example of a collection filter that applies only to a specific member schema could be:

from decimal import Decimal
import dataframely as dy
import polars as pl
class InvoiceIdSchema(dy.Schema):
    """Base schema declaring the ``invoice_id`` string column as a primary key."""

    # Schemas that subclass this one inherit the ``invoice_id`` key column.
    invoice_id = dy.String(primary_key=True)
class InvoiceSchema(InvoiceIdSchema):
    """Schema for invoice rows, keyed by the inherited ``invoice_id`` column.

    All four columns declared here are non-nullable, and ``amount`` must be
    strictly greater than zero. Two row-level rules constrain the ordering of
    the date columns.
    """

    admission_date = dy.Date(nullable=False)
    discharge_date = dy.Date(nullable=False)
    received_at = dy.Datetime(nullable=False)
    amount = dy.Decimal(nullable=False, min_exclusive=Decimal(0))

    @dy.rule()
    def discharge_after_admission(cls) -> pl.Expr:
        """Return an expression that is true when discharge is on or after admission."""
        discharged = pl.col("discharge_date")
        admitted = pl.col("admission_date")
        return discharged >= admitted

    @dy.rule()
    def received_at_after_discharge(cls) -> pl.Expr:
        """Return an expression that is true when the receipt date is on or after discharge."""
        # Only the date part of the ``received_at`` timestamp is compared.
        received_on = pl.col("received_at").dt.date()
        discharged = pl.col("discharge_date")
        return received_on >= discharged
class DiagnosisSchema(InvoiceIdSchema):
    """Schema for diagnosis rows attached to an invoice.

    ``diagnosis_code`` is marked as a primary key in addition to the inherited
    ``invoice_id`` (presumably forming a composite key -- confirm against the
    dataframely documentation).
    """

    diagnosis_code = dy.String(primary_key=True, regex=r"[A-Z][0-9]{2,4}")
    diagnosis_date = dy.Date(nullable=False)
    is_main = dy.Bool(nullable=False)

    @dy.rule(group_by=["invoice_id"])
    def exactly_one_main_diagnosis(cls) -> pl.Expr:
        """Return an expression that is true when the ``is_main`` flags sum to one.

        The rule is declared with ``group_by=["invoice_id"]``, so the sum is
        presumably evaluated once per invoice.
        """
        main_count = pl.col("is_main").sum()
        return main_count == 1
class HospitalClaims(dy.Collection):
    """Collection pairing invoice rows with their diagnosis rows.

    The two methods below illustrate the requested API: a filter restricted to
    a single member and a collection-level rule spanning both members.
    """

    invoices: dy.LazyFrame[InvoiceSchema]
    diagnoses: dy.LazyFrame[DiagnosisSchema]

    @dy.filter(members=["diagnoses"])  # Restrict this filter to the diagnoses member
    def diagnosis_date_after_admission(self) -> pl.LazyFrame:
        """
        Keep diagnoses dated on or after the admission date of their invoice.

        The admission date lives on the invoices member, so it is joined onto
        the diagnoses by ``invoice_id`` before comparing.

        Returns: LazyFrame with the diagnoses rows that pass the comparison
        """
        diagnoses = self.diagnoses
        admissions = self.invoices.select(["invoice_id", "admission_date"])
        with_admission = diagnoses.join(admissions, on="invoice_id", how="left")
        # NOTE(review): with a left join, diagnoses without a matching invoice
        # get a null admission_date; the comparison is then null and the filter
        # drops them. The result also carries the joined admission_date column.
        is_on_or_after_admission = pl.col("diagnosis_date") >= pl.col("admission_date")
        return with_admission.filter(is_on_or_after_admission)

    @dy.rule()  # Collection-level validation spanning both members
    def diagnosis_before_discharge(self) -> pl.LazyFrame:
        """
        Find diagnoses dated after the discharge date of their invoice.

        Returns: LazyFrame with the violating rows (diagnosis_date > discharge_date)
        """
        diagnoses = self.diagnoses
        discharges = self.invoices.select(["invoice_id", "discharge_date"])
        # Inner join: only diagnoses with a matching invoice are checked.
        with_discharge = diagnoses.join(discharges, on="invoice_id", how="inner")
        is_after_discharge = pl.col("diagnosis_date") > pl.col("discharge_date")
        return with_discharge.filter(is_after_discharge)

Requested feature: the ability to apply a collection filter to only specific member schemas, rather than to every member of the collection.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions