Skip to content

Core

The core module provides the building blocks for defining DataFrame schemas.

Defining a schema

from typing import Annotated
import pandas as pd
from typedframes import BaseSchema, Column

class UserSchema(BaseSchema):
    user_id = Column(type=int)
    email   = Column(type=str)
    region  = Column(type=str)

# Annotate with the schema — the checker validates all downstream column access
df: Annotated[pd.DataFrame, UserSchema] = pd.read_csv("users.csv")
print(df["user_id"])   # ✓ validated at lint time
print(df["username"])  # ✗ unknown-column: 'username' not in UserSchema

Descriptor access

Column descriptors provide refactor-safe column name references:

# .s — returns column name as str, for pandas subscript access
df[UserSchema.user_id.s]             # equivalent to df["user_id"]
df.groupby(UserSchema.region.s)

# .col — returns pl.Expr, for polars expressions
df.filter(UserSchema.email.col.is_not_null())
df.select(UserSchema.user_id.col, UserSchema.region.col)

Column sets

ColumnSet captures groups of columns — by explicit list or regex pattern:

from typedframes import ColumnSet

class MetricsSchema(BaseSchema):
    user_id    = Column(type=int)
    score_cols = ColumnSet(members=r"score_\d+", type=float, regex=True)

# .s returns the explicit member names (raises ValueError for regex sets such as MetricsSchema.score_cols)
class ReportSchema(BaseSchema):
    user_id  = Column(type=int)
    metadata = ColumnSet(members=["source", "campaign"], type=str)

df[ReportSchema.metadata.s]  # → ["source", "campaign"]

Column groups

ColumnGroup bundles related Column or ColumnSet members under one name:

from typedframes import ColumnGroup, ColumnSet

class SensorSchema(BaseSchema):
    timestamp    = Column(type=str)
    temperatures = ColumnSet(members=r"temp_\d+",     type=float, regex=True)
    pressures    = ColumnSet(members=r"pressure_\d+", type=float, regex=True)

    # Bundle both sets for convenient selection
    all_sensors = ColumnGroup(members=[temperatures, pressures])

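# .s returns flat list of names (raises ValueError for regex members)
# .cols() returns list of pl.Expr for polars selection; regex members need the
# names they matched, so pass the consumed map computed from the actual columns
_, consumed = SensorSchema.compute_column_map(df.columns)
df.select(SensorSchema.all_sensors.cols(consumed))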

typedframes.BaseSchema

Backend-agnostic schema definition for DataFrame validation.

Define your DataFrame schema once and use it for static analysis with both pandas and polars.

Class Attributes

  • enforce_columns: If True, validate that all defined columns exist.
  • enforce_types: If True, enforce column types during I/O operations.
  • allow_extra_columns: If True, allow columns not defined in schema.
  • greedy_column_sets: If True, allow columns to match multiple ColumnSets.

Example

class UserData(BaseSchema):
    user_id = Column(type=int)
    email = Column(type=str)
    scores = ColumnSet(members=r"score_\d+", type=float, regex=True)

Use with type annotations for static analysis:

df: Annotated[pd.DataFrame, UserData] = pd.read_csv("data.csv")
df: Annotated[pl.DataFrame, UserData] = pl.read_csv("data.csv")

Functions

all_column_names() classmethod

Return list of all explicitly defined column names (from Columns and non-regex ColumnSets).

Source code in src/typedframes/base_schema.py
121
122
123
124
125
126
127
128
129
130
@classmethod
def all_column_names(cls) -> list[str]:
    """
    List every column name the schema spells out explicitly.

    Names come first from the individual Columns (each one's ``column_name``),
    followed by the members of every ColumnSet that is not a regex set and
    stores its members as a list.
    """
    from_columns = [column.column_name for column in cls.columns().values()]
    from_explicit_sets = [
        member
        for column_set in cls.column_sets().values()
        if not column_set.regex and isinstance(column_set.members, list)
        for member in column_set.members
    ]
    return from_columns + from_explicit_sets

column_groups() classmethod

Return mapping of attribute names to ColumnGroup definitions, including inherited.

Source code in src/typedframes/base_schema.py
114
115
116
117
118
119
@classmethod
def column_groups(cls) -> dict[str, ColumnGroup]:
    """
    Map attribute names to ColumnGroup definitions, including inherited ones.

    The mapping is built once by ``_collect_from_mro`` and cached on the class.
    The cache is looked up in this class's own ``__dict__``, so a value cached
    on a parent class is never reused for a subclass.
    """
    cached = cls.__dict__.get("_column_group_map")
    if cached is None:
        cached = _collect_from_mro(cls, ColumnGroup)
        cls._column_group_map = cached
    return cached

column_sets() classmethod

Return mapping of attribute names to ColumnSet definitions, including inherited.

Source code in src/typedframes/base_schema.py
107
108
109
110
111
112
@classmethod
def column_sets(cls) -> dict[str, ColumnSet]:
    """
    Map attribute names to ColumnSet definitions, including inherited ones.

    The mapping is built once by ``_collect_from_mro`` and cached on the class.
    The cache is looked up in this class's own ``__dict__``, so a value cached
    on a parent class is never reused for a subclass.
    """
    cached = cls.__dict__.get("_column_set_map")
    if cached is None:
        cached = _collect_from_mro(cls, ColumnSet)
        cls._column_set_map = cached
    return cached

columns() classmethod

Return mapping of attribute names to Column definitions, including inherited.

Source code in src/typedframes/base_schema.py
100
101
102
103
104
105
@classmethod
def columns(cls) -> dict[str, Column]:
    """
    Map attribute names to Column definitions, including inherited ones.

    The mapping is built once by ``_collect_from_mro`` and cached on the class.
    The cache is looked up in this class's own ``__dict__``, so a value cached
    on a parent class is never reused for a subclass.
    """
    cached = cls.__dict__.get("_column_map")
    if cached is None:
        cached = _collect_from_mro(cls, Column)
        cls._column_map = cached
    return cached

compute_column_map(dataframe_columns, *, greedy=None) classmethod

Compute column type map and ColumnSet consumption from actual DataFrame columns.

Parameters:

Name Type Description Default
dataframe_columns list[str]

List of column names from the DataFrame.

required
greedy bool | None

Override greedy_column_sets setting.

None

Returns:

Type Description
tuple[dict[str, type], dict[str, list[str]]]

Tuple of (column_type_map, column_consumed_map) where:
  • column_type_map: Dict mapping column name to its type
  • column_consumed_map: Dict mapping ColumnSet name to list of matched columns

Raises:

Type Description
ColumnGroupError

If a column matches multiple ColumnSets (when not greedy).

Source code in src/typedframes/base_schema.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
@classmethod
def compute_column_map(
    cls,
    dataframe_columns: list[str],
    *,
    greedy: bool | None = None,
) -> tuple[dict[str, type], dict[str, list[str]]]:
    """
    Compute column type map and ColumnSet consumption from actual DataFrame columns.

    Each DataFrame column is first looked up by name in the map returned by
    ``_build_key_column_map``; every ColumnSet is then tested against every
    column through ``_match_column_to_set``.

    Args:
        dataframe_columns: List of column names from the DataFrame.
        greedy: Override greedy_column_sets setting. ``None`` means "use the
            class-level ``greedy_column_sets`` value".

    Returns:
        Tuple of (column_type_map, column_consumed_map) where:
        - column_type_map: Dict mapping column name to its type
        - column_consumed_map: Dict mapping ColumnSet name to list of matched columns.
          Only ColumnSets that matched at least one column appear as keys.

    Raises:
        ColumnGroupError: If a column matches multiple ColumnSets (when not greedy).
            Not raised in this body directly — presumably by
            ``_match_column_to_set``; confirm against that helper.

    """
    greedy = greedy if greedy is not None else cls.greedy_column_sets
    column_consumed_map: dict[str, list[str]] = defaultdict(list)
    key_column_map = cls._build_key_column_map()

    # NOTE(review): this shortcut returns every entry of key_column_map and never
    # looks at dataframe_columns, whereas the general path below only reports
    # columns that are present in dataframe_columns — confirm this is intended.
    if not cls.column_sets():
        return {k: v.type for k, v in key_column_map.items()}, dict(column_consumed_map)

    # What each DataFrame column currently resolves to (None when the name has no
    # entry in key_column_map), plus a parallel "already claimed" flag.
    column_bag: list[Column | ColumnSet | None] = [key_column_map.get(c) for c in dataframe_columns]
    consumed: list[bool] = [col is not None for col in column_bag]

    column_sets_list = list(cls.column_sets().values())
    for i, col_name in enumerate(dataframe_columns):
        # Every set is tried, even after an earlier one matched; the helper sees the
        # current state so it can decide how to treat a second match.
        for cs in column_sets_list:
            if cls._match_column_to_set(
                col_name, cs, consumed=consumed[i], greedy=greedy, current_match=column_bag[i]
            ):
                consumed[i] = True
                # A later matching set overwrites the earlier match for this column.
                column_bag[i] = cs
                column_consumed_map[cs.name].append(col_name)

    # Columns that resolved to nothing are left out of the type map.
    result: dict[str, type] = {}
    for i, col_or_set in enumerate(column_bag):
        if col_or_set is not None:
            result[dataframe_columns[i]] = col_or_set.type

    # Hand back a plain dict so callers do not get defaultdict auto-insertion.
    return result, dict(column_consumed_map)

from_pandas(df) classmethod

Return the given pandas DataFrame unchanged.

This is a simple passthrough for use with type annotations: the linter performs static validation, and this method performs no runtime validation and attaches no metadata.

Source code in src/typedframes/base_schema.py
235
236
237
238
239
240
241
242
243
@classmethod
def from_pandas(cls, df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the given pandas DataFrame unchanged.

    This is a simple passthrough for use with type annotations: no runtime
    validation is performed and nothing is attached to ``df``.
    Static validation is left to the linter.
    """
    return df

from_polars(df) classmethod

Return the given polars DataFrame unchanged.

This is a simple passthrough for use with type annotations: the linter performs static validation, and this method performs no runtime validation and attaches no metadata.

Source code in src/typedframes/base_schema.py
245
246
247
248
249
250
251
252
253
@classmethod
def from_polars(cls, df: pl.DataFrame) -> pl.DataFrame:
    """
    Return the given polars DataFrame unchanged.

    This is a simple passthrough for use with type annotations: no runtime
    validation is performed and nothing is attached to ``df``.
    Static validation is left to the linter.
    """
    return df

validate_columns(df_columns) classmethod

Validate DataFrame columns against schema.

Returns list of error messages (empty if valid).

Source code in src/typedframes/base_schema.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
@classmethod
def validate_columns(cls, df_columns: list[str]) -> list[str]:
    """
    Check a DataFrame's column names against this schema.

    Two kinds of problems are reported, in this order:

    * "Missing required column" for each schema Column whose ``column_name`` is
      absent from ``df_columns`` (only when ``enforce_columns`` is true).
    * "Unexpected column" for each DataFrame column that is neither an explicitly
      defined name nor matched by a pattern of a regex ColumnSet (only when
      ``allow_extra_columns`` is false).

    Returns:
        The error messages; an empty list means the columns are valid.

    """
    explicit_names = set(cls.all_column_names())

    problems: list[str] = []
    for column in cls.columns().values():
        if column.column_name not in df_columns and cls.enforce_columns:
            problems.append(f"Missing required column: {column.column_name}")

    if cls.allow_extra_columns:
        return problems

    def matches_regex_set(name: str) -> bool:
        # re.match anchors at the start of the name only, so a pattern may
        # match a prefix of a longer column name.
        return any(
            re.match(pattern, name)
            for column_set in cls.column_sets().values()
            if column_set.regex and isinstance(column_set.members, list)
            for pattern in column_set.members
        )

    problems.extend(
        f"Unexpected column: {name}"
        for name in df_columns
        if name not in explicit_names and not matches_regex_set(name)
    )
    return problems

typedframes.Column(type=Any, alias=None, nullable=False, description='') dataclass

Represents a single column in a DataFrame schema.

Attributes:

Name Type Description
type type

The Python type of the column (e.g., int, str, float).

alias str | None

Optional alternative name for the column in the actual DataFrame.

nullable bool

Whether the column allows null values.

description str

Human-readable description of the column's purpose.

Example

class UserData(BaseSchema):
    user_id = Column(type=int)
    email = Column(type=str, alias="user_email")
    age = Column(type=int, nullable=True, description="User's age in years")

Attributes

col property

Return a polars column expression for this column.

Useful for building polars queries with schema column references.

Example

df.filter(UserSchema.age.col > 18)
df.select(UserSchema.email.col, UserSchema.user_id.col)

column_name property

Return the effective column name (alias if set, otherwise attribute name).

s property

Return the column name as a string for native pandas subscript access.

Provides refactor-safe column name access without requiring PandasFrame. Parallel to .col (polars expression) for pandas string access.

Example

df[UserSchema.user_id.s]          # pandas — works with plain pd.DataFrame
df.groupby(UserSchema.region.s)   # pandas groupby

Functions

__set_name__(owner, name)

Set the name attribute from the class attribute name.

Source code in src/typedframes/column.py
37
38
39
def __set_name__(self, owner: type, name: str) -> None:
    """
    Set the name attribute from the class attribute name.

    Python invokes this hook when the owning class body is created, passing the
    attribute name the instance was assigned to. ``owner`` is not used.
    """
    self.name = name

__str__()

Return the column name as a string for use in subscript access.

Source code in src/typedframes/column.py
83
84
85
def __str__(self) -> str:
    """
    Return the column name as a string for use in subscript access.

    The value is ``self.column_name``, so ``str(column)`` yields the same name
    as that property.
    """
    return self.column_name

typedframes.ColumnSet(members, type=Any, regex=False, description='') dataclass

Represents a set of columns matching a pattern or explicit list.

Used for grouping related columns that share a common type, such as time series data or multi-dimensional measurements.

Attributes:

Name Type Description
members list[str]

List of column names (or a single name as a string, which is normalized to a single-element list). When regex=True, each string is treated as a regex pattern.

type type

The Python type shared by all columns in the set.

regex bool

If True, each entry in members is treated as a regex pattern for matching column names.

description str

Human-readable description of the column set's purpose.

Example

class SensorData(BaseSchema):
    # Explicit member list
    temperatures = ColumnSet(members=["temp_1", "temp_2", "temp_3"], type=float)

    # Regex pattern matching
    pressures = ColumnSet(members=r"pressure_\d+", type=float, regex=True)

Attributes

s property

Return the column names as a list of strings for native pandas subscript access.

For non-regex ColumnSets, returns the explicit member names. For regex ColumnSets, raises ValueError because the matched column names are only known at runtime (use PandasFrame.from_schema() for regex resolution).

Example

df[SensorSchema.temperatures.s] # pandas — works for non-regex ColumnSets

Functions

__post_init__()

Normalize members to a list.

Source code in src/typedframes/column_set.py
43
44
45
46
def __post_init__(self) -> None:
    """
    Normalize members to a list.

    A single string is wrapped in a one-element list; any other value is left
    exactly as it was passed.
    """
    if isinstance(self.members, str):
        self.members = [self.members]

__set_name__(owner, name)

Set the name attribute from the class attribute name.

Source code in src/typedframes/column_set.py
48
49
50
def __set_name__(self, owner: type, name: str) -> None:
    """
    Set the name attribute from the class attribute name.

    Python invokes this hook when the owning class body is created, passing the
    attribute name the instance was assigned to. ``owner`` is not used.
    """
    self.name = name

cols(matched_columns=None)

Return polars column expressions for all columns in this set.

Parameters:

Name Type Description Default
matched_columns list[str] | None

Optional list of column names that matched this set. If not provided, uses the explicit members list (not applicable for regex).

None

Returns:

Type Description
list[Expr]

List of polars column expressions.

Example

df.select(SensorSchema.temperatures.cols())

Source code in src/typedframes/column_set.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def cols(self, matched_columns: list[str] | None = None) -> list[pl.Expr]:
    """
    Build one polars column expression per column in this set.

    Args:
        matched_columns: Column names already resolved for this set. When given,
            they are used as-is, whether or not the set is a regex set.
            When omitted, the explicit ``members`` list is used instead.

    Returns:
        A list of ``pl.col(...)`` expressions, in the order of the names used.

    Raises:
        MissingDependencyError: If polars cannot be imported.
        ValueError: If this is a regex set and ``matched_columns`` is omitted.

    Example:
        df.select(SensorSchema.temperatures.cols())

    """
    # Imported lazily so the module itself does not require polars.
    try:
        import polars as pl
    except ImportError:
        from .missing_dependency_error import MissingDependencyError

        package = "polars"
        raise MissingDependencyError(package, "ColumnSet.cols") from None

    if matched_columns is None:
        if self.regex:
            msg = "Cannot get column expressions for regex members without matched_columns"
            raise ValueError(msg)
        names = self.members
    else:
        names = matched_columns

    return [pl.col(name) for name in names]

typedframes.ColumnGroup(members, description='') dataclass

Groups multiple Columns and ColumnSets for convenient access.

Useful for organizing related columns that span multiple Column/ColumnSet definitions, such as grouping all measurement columns together.

Attributes:

Name Type Description
members list[Column | ColumnSet | ColumnGroup]

List of Column, ColumnSet, or nested ColumnGroup objects.

description str

Human-readable description of the group's purpose.

Example

class SensorData(BaseSchema):
    timestamp = Column(type=str)
    temperatures = ColumnSet(members=r"temp_\d+", type=float, regex=True)
    pressures = ColumnSet(members=r"pressure_\d+", type=float, regex=True)

    # Group for convenient access to all sensor data
    all_sensors = ColumnGroup(members=[temperatures, pressures])

Attributes

s property

Return all column names in this group as a list of strings.

Groups containing regex ColumnSet members raise ValueError because matched column names are only known at runtime (use PandasFrame.from_schema() for regex resolution).

Returns:

Type Description
list[str]

Flat list of column name strings for all non-regex members.

Example

df[SensorSchema.all_sensors.s] # pandas — works for non-regex groups

Functions

__set_name__(owner, name)

Set the name attribute from the class attribute name.

Source code in src/typedframes/column_group.py
42
43
44
def __set_name__(self, owner: type, name: str) -> None:
    """
    Set the name attribute from the class attribute name.

    Python invokes this hook when the owning class body is created, passing the
    attribute name the instance was assigned to. ``owner`` is not used.
    """
    self.name = name

cols(consumed_map=None)

Return polars column expressions for all columns in this group.

Parameters:

Name Type Description Default
consumed_map dict[str, list[str]] | None

Mapping of ColumnSet names to matched column names. Required for ColumnSets with regex patterns.

None

Returns:

Type Description
list[Expr]

List of polars column expressions.

Example

df.select(SensorSchema.all_sensors.cols())

Source code in src/typedframes/column_group.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def cols(self, consumed_map: dict[str, list[str]] | None = None) -> list[pl.Expr]:
    """
    Build one polars column expression per column name in this group.

    The names are those returned by ``get_column_names(consumed_map)``, in the
    same order.

    Args:
        consumed_map: Mapping of ColumnSet names to matched column names.
            Required for ColumnSets with regex patterns.

    Returns:
        List of polars column expressions.

    Raises:
        MissingDependencyError: If polars cannot be imported.

    Example:
        df.select(SensorSchema.all_sensors.cols())

    """
    # Imported lazily so the module itself does not require polars.
    try:
        import polars as pl
    except ImportError:
        from .missing_dependency_error import MissingDependencyError

        package = "polars"
        raise MissingDependencyError(package, "ColumnGroup.cols") from None

    return [pl.col(column_name) for column_name in self.get_column_names(consumed_map)]

get_column_names(consumed_map=None)

Get all column names in this group.

Parameters:

Name Type Description Default
consumed_map dict[str, list[str]] | None

Mapping of ColumnSet names to matched column names. Required for ColumnSets with regex patterns.

None

Returns:

Type Description
list[str]

List of column names.

Source code in src/typedframes/column_group.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def get_column_names(self, consumed_map: dict[str, list[str]] | None = None) -> list[str]:
    """
    Get all column names in this group, flattened in member order.

    Args:
        consumed_map: Mapping of ColumnSet names to matched column names.
            Required for ColumnSets with regex patterns.

    Returns:
        List of column names. A regex ColumnSet that is absent from a supplied
        ``consumed_map`` contributes no names.

    Raises:
        ValueError: If the group contains a regex ColumnSet and no
            ``consumed_map`` was supplied, because the set's column names
            cannot be known without it.

    """
    resolved = consumed_map or {}
    result: list[str] = []

    for member in self.members:
        if isinstance(member, Column):
            effective_name = member.alias if isinstance(member.alias, str) else member.name
            result.append(effective_name)
        elif isinstance(member, ColumnSet):
            if member.name in resolved:
                result.extend(resolved[member.name])
            elif not member.regex:
                result.extend(member.members)
            elif consumed_map is None:
                # For a regex set, ``members`` holds patterns, not column names;
                # returning them would hand callers names that do not exist.
                msg = f"Cannot get column names for regex ColumnSet {member.name!r} without consumed_map"
                raise ValueError(msg)
            # Otherwise a map was supplied but has no entry for this regex set:
            # compute_column_map only records sets that matched at least one
            # column, so the set matched nothing and contributes nothing.
        elif isinstance(member, ColumnGroup):
            # Forward the caller's argument untouched so nested groups can also
            # tell "no map supplied" apart from "empty map supplied".
            result.extend(member.get_column_names(consumed_map))

    return result