Selectors#

Inspired by (or rather, directly lifted from) Polars, selectors are a convenient way to interact with the schema.

Importing#

Selectors are available from the top-level namespace

import checkedframe as cf

cf.selectors.all()

For a shorter name, the conventional alias is cfs

import checkedframe.selectors as cfs

cfs.all()

Usage#

Selectors support set operations and resolve to a list of column names. For example, given the schema below, the following expressions resolve as shown:

import checkedframe as cf
import checkedframe.selectors as cfs


class S(cf.Schema):
   float1 = cf.Float64()
   float2 = cf.Float64()
   string1 = cf.String()

Operation

Expression

Result

UNION

cfs.float() | cfs.string()

["float1", "float2", "string1"]

INTERSECTION

cfs.float() & cfs.contains("1")

["float1"]

DIFFERENCE

cfs.float() - cfs.contains("1")

["float2"]

SYMMETRIC DIFFERENCE

cfs.float() ^ cfs.contains("1")

["float2", "string1"]

COMPLEMENT

~cfs.float()

["string1"]

Note

Selectors operate on the schema, not the DataFrame.

Functions#

checkedframe.selectors.all() Selector#

Select all columns.

Return type:

Selector

Examples

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    balances = cf.Float64()


df = pl.DataFrame({
    "customer_id": ["a", "b", None],
    "balances": [1.0, None, 3.0],
})

S.validate(df)

Output:

SchemaError: Found 2 error(s)
  customer_id: 1 error(s)
    - `nullable=False` failed for 1 / 3 (33.33%) rows: Must not be null
  balances: 1 error(s)
    - `nullable=False` failed for 1 / 3 (33.33%) rows: Must not be null

Make all columns nullable

@cf.apply_configs(cf.Config(cfs.all(), nullable=True))
class S(cf.Schema):
    customer_id = cf.String()
    balances = cf.Float64()


S.validate(df)
checkedframe.selectors.boolean() Selector#

Select all boolean columns.

Return type:

Selector

Examples

Apply a check to all boolean columns

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    is_new_customer = cf.Boolean()
    is_high_balances = cf.Boolean()

    @cf.Check(columns=cfs.boolean())
    def check_cardinality(s: pl.Series) -> bool:
        """Should not be all True/False"""
        return not (s.all() or s.not_().all())


df = pl.DataFrame(
    {
        "is_new_customer": [False, False, False],
        "is_high_balances": [True, True, False],
    }
)

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  is_new_customer: 1 error(s)
    - check_cardinality failed for 3 / 3 (100.00%) rows: Should not be all True/False
checkedframe.selectors.by_dtype(*dtypes: TypedColumn | type[TypedColumn] | CfUnion | Iterable[TypedColumn | type[TypedColumn] | CfUnion]) Selector#

Select columns whose dtypes match the given dtypes.

Return type:

Selector

Examples

Apply a check to all List(String) columns

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    payment_pattern = cf.List(cf.String)
    payment_pattern_enum = cf.List(cf.Int64)

    @cf.Check(columns=cfs.by_dtype(cf.List(cf.String)))
    def check_list_length(name: str) -> pl.Expr:
        """Each payment pattern must have at least one element."""
        return pl.col(name).list.len() > 0


df = pl.DataFrame({
    "customer_id": ["a", "b", "c"],
    "payment_pattern": [["a"], [], ["c"]],
    "payment_pattern_enum": [[1], [2], [3]],
})

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  payment_pattern: 1 error(s)
    - check_list_length failed for 1 / 3 (33.33%) rows: Each payment pattern must have at least one element.
checkedframe.selectors.by_name(*names: str | Iterable[str]) Selector#

Select all columns matching the given names.

Return type:

Selector

Examples

Make all float columns and customer_id nullable

import checkedframe as cf
import checkedframe.selectors as cfs

@cf.apply_configs(cf.Config(cfs.float() | cfs.by_name("customer_id"), nullable=True))
class S(cf.Schema):
    customer_id = cf.String()
    checking_balances = cf.Float64()
    savings_balances = cf.Float64()
checkedframe.selectors.categorical() Selector#

Select all categorical columns.

Return type:

Selector

Examples

Apply a check to all categorical columns

import checkedframe as cf
import checkedframe.selectors as cfs
import pandas as pd

class S(cf.Schema):
    customer_id = cf.String()
    business_type = cf.Categorical()

    @cf.Check(columns=cfs.categorical())
    def check_categories(s: pd.Series) -> bool:
        """Number of categories must be <= 2."""
        return len(s.cat.categories) <= 2


df = pd.DataFrame(
    {
        "customer_id": ["a23", "c39", "b88"],
        "business_type": ["tech", "finance", "non-profit"],
    }
).assign(business_type=lambda df: df["business_type"].astype("category"))

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  business_type: 1 error(s)
    - check_categories failed for 3 / 3 (100.00%) rows: Number of categories must be <= 2.
checkedframe.selectors.contains(*substrings: str | Iterable[str]) Selector#

Select columns whose names contain the given literal substring(s).

Return type:

Selector

Examples

Apply a check to all columns that contain “balances”

import checkedframe as cf
import checkedframe.selectors as cfs
import pandas as pd

class S(cf.Schema):
    balances_L1 = cf.Float64()
    balances_L2 = cf.Float64()

    @cf.Check(columns=cfs.contains("balances"))
    def check_balances_range(s: pd.Series) -> pd.Series:
        """Balances must be in range [0, 1_000_000]"""
        return (s >= 0) & (s <= 1_000_000)


df = pd.DataFrame(
    {"balances_L1": [-1.0, 500.56, 300.12], "balances_L2": [500.29, 600.99, 700.42]}
)

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  balances_L1: 1 error(s)
    - check_balances_range failed for 1 / 3 (33.33%) rows: Balances must be in range [0, 1_000_000]
checkedframe.selectors.date() Selector#

Select all date columns.

Return type:

Selector

Examples

Apply a check to all date columns

import datetime

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    birth_date = cf.Date()

    @cf.Check(columns=cfs.date())
    def check_date(name: str) -> pl.Expr:
        """Date must be after 1900-01-01"""
        return pl.col(name) > datetime.date(1900, 1, 1)


df = pl.DataFrame(
    {
        "customer_id": ["a", "b", "c"],
        "birth_date": [
            datetime.date(1899, 1, 1),
            datetime.date(2000, 1, 1),
            datetime.date(2010, 1, 1),
        ],
    }
)

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  birth_date: 1 error(s)
    - check_date failed for 1 / 3 (33.33%) rows: Date must be after 1900-01-01
checkedframe.selectors.datetime() Selector#

Select all datetime columns.

Return type:

Selector

Examples

Datetimes must be after 1900-01-01.

import datetime

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    birth_date = cf.Datetime()

    @cf.Check(columns=cfs.datetime())
    def check_date(name: str) -> pl.Expr:
        """Date must be after 1900-01-01"""
        return pl.col(name) > datetime.datetime(1900, 1, 1)


df = pl.DataFrame(
    {
        "customer_id": ["a", "b", "c"],
        "birth_date": [
            datetime.datetime(1899, 1, 1),
            datetime.datetime(2000, 1, 1),
            datetime.datetime(2010, 1, 1),
        ],
    }
)

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  birth_date: 1 error(s)
    - check_date failed for 1 / 3 (33.33%) rows: Date must be after 1900-01-01
checkedframe.selectors.decimal() Selector#

Select all decimal columns.

Return type:

Selector

Examples

Apply a check to all decimal columns

from decimal import Decimal

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    income = cf.Decimal()

    @cf.Check(columns=cfs.decimal())
    def check_positive(name: str) -> pl.Expr:
        """Decimal must be positive"""
        return pl.col(name) > 0


df = pl.DataFrame({
    "customer_id": ["a", "b", "c"],
    "income": [Decimal("100.0"), Decimal("-50.0"), Decimal("200.0")],
})

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  income: 1 error(s)
    - check_positive failed for 1 / 3 (33.33%) rows: Decimal must be positive
checkedframe.selectors.ends_with(*suffixes: str | Iterable[str]) Selector#

Select columns whose names end with the given literal substring(s).

Return type:

Selector

Examples

Apply a check to all columns ending in “balances”

import checkedframe as cf
import checkedframe.selectors as cfs
import pandas as pd

class S(cf.Schema):
    L1_balances = cf.Float64()
    L2_balances = cf.Float64()

    @cf.Check(columns=cfs.ends_with("balances"))
    def check_balances_range(s: pd.Series) -> pd.Series:
        """Balances must be in range [0, 1_000_000]"""
        return (s >= 0) & (s <= 1_000_000)


df = pd.DataFrame(
    {"L1_balances": [-1.0, 500.56, 300.12], "L2_balances": [500.29, 600.99, 700.42]}
)

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  L1_balances: 1 error(s)
    - check_balances_range failed for 1 / 3 (33.33%) rows: Balances must be in range [0, 1_000_000]
checkedframe.selectors.float() Selector#

Select all float columns.

Return type:

Selector

Examples

Allow NaNs in all float columns

import checkedframe as cf
import checkedframe.selectors as cfs

@cf.apply_configs(cf.Config(cfs.float(), allow_nan=True))
class S(cf.Schema):
    customer_id = cf.String()
    balances = cf.Float64()
    income = cf.Float32()
checkedframe.selectors.integer() Selector#

Select all integer columns.

Return type:

Selector

Examples

Apply a check to all integer columns

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    tenure = cf.Int64()
    age = cf.UInt32()

    @cf.Check(columns=cfs.integer())
    def check_positive(name: str) -> pl.Expr:
        """Value must be positive"""
        return pl.col(name) > 0


df = pl.DataFrame({
    "customer_id": ["a", "b", "c"],
    "tenure": [1, 2, 3],
    "age": [0, 2, 3],
})

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  age: 1 error(s)
    - check_positive failed for 1 / 3 (33.33%) rows: Value must be positive
checkedframe.selectors.matches(pattern: str | Pattern[str]) Selector#

Select all columns that match the given regex pattern.

Parameters:

pattern (str | re.Pattern[str]) – A Python regex

Return type:

Selector

Examples

Apply a check to all columns ending in “L” followed by a number

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    age = cf.Int64()
    age_L1 = cf.Int64()
    age_L2 = cf.Int64()

    @cf.Check(columns=cfs.matches(r"L\d+$"))
    def check_lags(name: str) -> pl.Expr:
        """Age lags should be less than current age"""
        return pl.col(name).lt(pl.col("age"))


df = pl.DataFrame({
    "age": [60, 62],
    "age_L1": [100, 61],
    "age_L2": [58, 60],
})

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  age_L1: 1 error(s)
    - check_lags failed for 1 / 2 (50.00%) rows: Age lags should be less than current age
checkedframe.selectors.numeric() Selector#

Select all numeric columns.

Return type:

Selector

Examples

Apply a check to all numeric columns

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    tenure = cf.Int64()
    age = cf.Float64()

    @cf.Check(columns=cfs.numeric())
    def check_numeric_max(name: str) -> pl.Expr:
        """Numeric cols must be less than 100"""
        return pl.col(name).lt(100)


df = pl.DataFrame(
    {
        "customer_id": ["a1", "b1", "c1"],
        "tenure": [10, 5, 40],
        "age": [30.2, 25.6, 150.1],
    }
)

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  age: 1 error(s)
    - check_numeric_max failed for 1 / 3 (33.33%) rows: Numeric cols must be less than 100
checkedframe.selectors.signed_integer() Selector#

Select all signed integer columns.

Return type:

Selector

Examples

Apply a check to all signed integer columns

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    tenure = cf.Int64()
    age = cf.UInt32()

    @cf.Check(columns=cfs.signed_integer())
    def check_positive(name: str) -> pl.Expr:
        """Value must be positive"""
        return pl.col(name) > 0


df = pl.DataFrame({
    "customer_id": ["a", "b", "c"],
    "tenure": [1, 2, 3],
    "age": [0, 2, 3],
})

S.validate(df)
checkedframe.selectors.starts_with(*prefixes: str | Iterable[str]) Selector#

Select columns whose names start with the given literal substring(s).

Return type:

Selector

Examples

Apply a check to all columns that start with “balances”

import checkedframe as cf
import checkedframe.selectors as cfs
import pandas as pd

class S(cf.Schema):
    balances_L1 = cf.Float64()
    balances_L2 = cf.Float64()

    @cf.Check(columns=cfs.starts_with("balances"))
    def check_balances_range(s: pd.Series) -> pd.Series:
        """Balances must be in range [0, 1_000_000]"""
        return (s >= 0) & (s <= 1_000_000)


df = pd.DataFrame(
    {"balances_L1": [-1.0, 500.56, 300.12], "balances_L2": [500.29, 600.99, 700.42]}
)

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  balances_L1: 1 error(s)
    - check_balances_range failed for 1 / 3 (33.33%) rows: Balances must be in range [0, 1_000_000]
checkedframe.selectors.string() Selector#

Select all string columns.

Return type:

Selector

Examples

Apply a check to all string columns

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    marital_status = cf.String()
    employment_status = cf.String()

    @cf.Check(columns=cfs.string())
    def check_cardinality(s: pl.Series) -> bool:
        """String column must have <= 2 unique values"""
        return s.n_unique() <= 2


df = pl.DataFrame(
    {
        "marital_status": ["married", "unmarried", "unknown"],
        "employment_status": ["employed", "unemployed", "employed"],
    }
)

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  marital_status: 1 error(s)
    - check_cardinality failed for 3 / 3 (100.00%) rows: String column must have <= 2 unique values
checkedframe.selectors.temporal() Selector#

Select all temporal columns.

Return type:

Selector

Examples

Apply a check to all temporal columns

import datetime

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    join_time = cf.Datetime()
    birth_date = cf.Date()

    @cf.Check(columns=cfs.temporal())
    def check_no_invalid_dates(name: str) -> pl.Expr:
        return pl.col(name).dt.date() >= pl.date(1900, 1, 1)

df = pl.DataFrame(
    {
        "join_time": [
            datetime.datetime(1899, 1, 1, 1, 1),
            datetime.datetime(2010, 2, 20, 6, 3, 9),
        ],
        "birth_date": [datetime.date(1899, 1, 1), datetime.date(2000, 5, 19)],
    },
)

S.validate(df)

Output:

SchemaError: Found 2 error(s)
  join_time: 1 error(s)
  - check_no_invalid_dates failed for 1 / 2 (50.00%) rows:
  birth_date: 1 error(s)
  - check_no_invalid_dates failed for 1 / 2 (50.00%) rows:
checkedframe.selectors.unsigned_integer() Selector#

Select all unsigned integer columns.

Return type:

Selector

Examples

Apply a check to all unsigned integer columns

import checkedframe as cf
import checkedframe.selectors as cfs
import polars as pl

class S(cf.Schema):
    customer_id = cf.String()
    tenure = cf.Int64()
    age = cf.UInt32()

    @cf.Check(columns=cfs.unsigned_integer())
    def check_positive(name: str) -> pl.Expr:
        """Value must be positive"""
        return pl.col(name) > 0


df = pl.DataFrame({
    "customer_id": ["a", "b", "c"],
    "tenure": [1, 2, 3],
    "age": [0, 2, 3],
})

S.validate(df)

Output:

SchemaError: Found 1 error(s)
  age: 1 error(s)
    - check_positive failed for 1 / 3 (33.33%) rows: Value must be positive