Constrainer API

Constrainer API

Synthetic data constraint processing module.

Class Architecture

classDiagram
    class Constrainer {
        dict config
        dict _constrainers
        int resample_trails
        __init__(config)
        apply(df, target_rows) DataFrame
        validate(data, return_details, max_examples_per_rule) dict
        resample_until_satisfy() DataFrame
        register(name, constraint_class)
    }

    class BaseConstrainer {
        <<abstract>>
        validate_config(df) bool
        apply(df) DataFrame
        _check_columns_exist(df, columns) bool
    }

    class NaNGroupConstrainer {
        dict constraints
        __init__(constraints)
        validate_config(df) None
        apply(df) DataFrame
    }

    class FieldConstrainer {
        list config
        list COMPARISON_OPS
        __init__(config)
        validate_config(df) bool
        apply(df) DataFrame
        _tokenize(condition) list
        _parse_expression() Series
    }

    class FieldCombinationConstrainer {
        list constraints
        __init__(constraints)
        validate_config(df) None
        apply(df) DataFrame
        _validate_constraint_format()
        _is_na_value(value) bool
    }

    class FieldProportionsConstrainer {
        FieldProportionsConfig config
        int _target_rows
        __init__(config)
        validate_config() bool
        apply(df, target_rows) DataFrame
        _constraint_filter_field_proportions()
    }

    class FieldProportionsConfig {
        list fields
        str mode
        float tolerance
        int _target_rows
        verify_data(data, target_n_rows)
        check_proportions(filtered_data)
    }

    BaseConstrainer <|-- NaNGroupConstrainer
    BaseConstrainer <|-- FieldConstrainer
    BaseConstrainer <|-- FieldCombinationConstrainer
    BaseConstrainer <|-- FieldProportionsConstrainer

    Constrainer ..> NaNGroupConstrainer
    Constrainer ..> FieldConstrainer
    Constrainer ..> FieldCombinationConstrainer
    Constrainer ..> FieldProportionsConstrainer
    
    FieldProportionsConstrainer *-- FieldProportionsConfig

    %% Style definitions
    style Constrainer fill:#e6f3ff,stroke:#4a90e2,stroke-width:3px
    style BaseConstrainer fill:#f3e6ff,stroke:#9966cc,stroke-width:2px
    style NaNGroupConstrainer fill:#fff2e6,stroke:#ff9800,stroke-width:2px
    style FieldConstrainer fill:#fff2e6,stroke:#ff9800,stroke-width:2px
    style FieldCombinationConstrainer fill:#fff2e6,stroke:#ff9800,stroke-width:2px
    style FieldProportionsConstrainer fill:#fff2e6,stroke:#ff9800,stroke-width:2px
    style FieldProportionsConfig fill:#f3e6ff,stroke:#9966cc,stroke-width:2px

Basic Usage

from petsard import Constrainer

# Initialize from a configuration dict (usually loaded from YAML)
constrainer = Constrainer(config)

# Apply constraints
constrained_data = constrainer.apply(synthetic_data)

# Validate data
validation_result = constrainer.validate(data)

Constructor

def __init__(config: dict)

Parameters

  • config: Constraint configuration dictionary (usually loaded from YAML)

Example

config = {
    'nan_groups': {...},
    'field_constraints': [...],
    'field_combinations': [...],
    'field_proportions': [...]
}
constrainer = Constrainer(config)

apply()

Apply all constraints to data.

def apply(df: pd.DataFrame, target_rows: int = None) -> pd.DataFrame

Parameters

  • df: Input DataFrame
  • target_rows: Target row count (optional, for internal use)

Returns

  • DataFrame meeting all constraints

Example

result = constrainer.apply(synthetic_data)

validate()

Validate data against constraints without modifying it.

def validate(
    data: pd.DataFrame,
    return_details: bool = True,
    max_examples_per_rule: int = 6
) -> dict

Parameters

  • data: DataFrame to validate
  • return_details: Whether to return detailed violation records
  • max_examples_per_rule: Maximum violation examples per rule

Returns

  • Validation result dictionary containing:
    • total_rows: Total row count
    • passed_rows: Passed row count
    • failed_rows: Failed row count
    • pass_rate: Pass rate
    • is_fully_compliant: Whether fully compliant
    • constraint_violations: Violation statistics
    • violation_details: Detailed violation records (optional; see return_details)

Example

result = constrainer.validate(data)
print(f"Pass rate: {result['pass_rate']:.2%}")

if not result['is_fully_compliant']:
    print(result['violation_details'])

resample_until_satisfy()

Resample until constraints are met and target row count is reached.

def resample_until_satisfy(
    data: pd.DataFrame,
    target_rows: int,
    synthesizer,
    postprocessor=None,
    max_trials: int = 300,
    sampling_ratio: float = 10.0,
    verbose_step: int = 10
) -> pd.DataFrame

Parameters

  • data: Initial data
  • target_rows: Target row count
  • synthesizer: Synthesizer instance
  • postprocessor: Postprocessor (optional)
  • max_trials: Maximum trials
  • sampling_ratio: Sampling multiplier per trial
  • verbose_step: Progress display interval

Returns

  • DataFrame meeting constraints and target row count

Example

result = constrainer.resample_until_satisfy(
    data=pd.DataFrame(),
    target_rows=1000,
    synthesizer=synthesizer,
    max_trials=50
)
print(f"Trials: {constrainer.resample_trails}")

register()

Register a custom constraint type.

@classmethod
def register(cls, name: str, constraint_class: type)

Parameters

  • name: Constraint type name
  • constraint_class: Class inheriting from BaseConstrainer

Example

class CustomConstrainer(BaseConstrainer):
    def apply(self, df):
        # Custom logic
        return df

Constrainer.register('custom', CustomConstrainer)

Method Comparison

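| Method | Purpose | Modifies Data |
|--------|---------|---------------|
| apply() | Apply and filter constraints | Yes (returns filtered rows) |
| validate() | Validate data quality | No (read-only check) |
| resample_until_satisfy() | Generate constraint-compliant data | Yes (generates new rows) |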

Notes

  • Constraint configuration should be defined in YAML files
  • Constraints are combined with strict AND logic: a row must satisfy every constraint
  • validate() does not modify data, only checks
  • resample_until_satisfy() is suitable for strict constraints