Constrainer API
Synthetic data constraint processing module.
Class Architecture
classDiagram
    class Constrainer {
        dict config
        dict _constrainers
        int resample_trails
        __init__(config)
        apply(df, target_rows) DataFrame
        resample_until_satisfy() DataFrame
        register(name, constraint_class)
    }
    class BaseConstrainer {
        <<abstract>>
        validate_config(df) bool
        apply(df) DataFrame
        _check_columns_exist(df, columns) bool
    }
    class NaNGroupConstrainer {
        dict constraints
        __init__(constraints)
        validate_config(df) None
        apply(df) DataFrame
    }
    class FieldConstrainer {
        list config
        list COMPARISON_OPS
        __init__(config)
        validate_config(df) bool
        apply(df) DataFrame
        _tokenize(condition) list
        _parse_expression() Series
    }
    class FieldCombinationConstrainer {
        list constraints
        __init__(constraints)
        validate_config(df) None
        apply(df) DataFrame
        _validate_constraint_format()
        _is_na_value(value) bool
    }
    class FieldProportionsConstrainer {
        FieldProportionsConfig config
        int _target_rows
        __init__(config)
        validate_config() bool
        apply(df, target_rows) DataFrame
        _constraint_filter_field_proportions()
    }
    class FieldProportionsConfig {
        list fields
        str mode
        float tolerance
        int _target_rows
        verify_data(data, target_n_rows)
        check_proportions(filtered_data)
    }
    BaseConstrainer <|-- NaNGroupConstrainer
    BaseConstrainer <|-- FieldConstrainer
    BaseConstrainer <|-- FieldCombinationConstrainer
    BaseConstrainer <|-- FieldProportionsConstrainer
    Constrainer ..> NaNGroupConstrainer
    Constrainer ..> FieldConstrainer
    Constrainer ..> FieldCombinationConstrainer
    Constrainer ..> FieldProportionsConstrainer
    FieldProportionsConstrainer *-- FieldProportionsConfig
    %% Style annotations
    style Constrainer fill:#e6f3ff,stroke:#4a90e2,stroke-width:3px
    style BaseConstrainer fill:#f3e6ff,stroke:#9966cc,stroke-width:2px
    style NaNGroupConstrainer fill:#fff2e6,stroke:#ff9800,stroke-width:2px
    style FieldConstrainer fill:#fff2e6,stroke:#ff9800,stroke-width:2px
    style FieldCombinationConstrainer fill:#fff2e6,stroke:#ff9800,stroke-width:2px
    style FieldProportionsConstrainer fill:#fff2e6,stroke:#ff9800,stroke-width:2px
    style FieldProportionsConfig fill:#f3e6ff,stroke:#9966cc,stroke-width:2px
Basic Usage
from petsard import Constrainer
# Initialize from YAML config
constrainer = Constrainer(config)
# Apply constraints
constrained_data = constrainer.apply(synthetic_data)
# Validate data
validation_result = constrainer.validate(data)
Constructor
def __init__(config: dict)
Parameters
config
: Constraint configuration dictionary (usually loaded from YAML)
Example
config = {
'nan_groups': {...},
'field_constraints': [...],
'field_combinations': [...],
'field_proportions': [...]
}
constrainer = Constrainer(config)
apply()
Apply all constraints to data.
def apply(df: pd.DataFrame, target_rows: int = None) -> pd.DataFrame
Parameters
df
: Input DataFrame
target_rows
: Target row count (optional, for internal use)
Returns
- DataFrame meeting all constraints
Example
result = constrainer.apply(synthetic_data)
validate()
Validate data against constraints without modifying it.
def validate(
data: pd.DataFrame,
return_details: bool = True,
max_examples_per_rule: int = 6
) -> dict
Parameters
data
: DataFrame to validate
return_details
: Whether to return detailed violation records
max_examples_per_rule
: Maximum number of violation examples per rule
Returns
- Validation result dictionary containing:
total_rows
: Total row count
passed_rows
: Passed row count
failed_rows
: Failed row count
pass_rate
: Pass rate
is_fully_compliant
: Whether the data is fully compliant
constraint_violations
: Violation statistics
violation_details
: Violation data (optional)
Example
result = constrainer.validate(data)
print(f"Pass rate: {result['pass_rate']:.2%}")
if not result['is_fully_compliant']:
    print(result['violation_details'])
resample_until_satisfy()
Resample until constraints are met and target row count is reached.
def resample_until_satisfy(
data: pd.DataFrame,
target_rows: int,
synthesizer,
postprocessor=None,
max_trials: int = 300,
sampling_ratio: float = 10.0,
verbose_step: int = 10
) -> pd.DataFrame
Parameters
data
: Initial data
target_rows
: Target row count
synthesizer
: Synthesizer instance
postprocessor
: Postprocessor (optional)
max_trials
: Maximum number of trials
sampling_ratio
: Sampling multiplier per trial
verbose_step
: Progress display interval
Returns
- DataFrame meeting constraints and target row count
Example
result = constrainer.resample_until_satisfy(
data=pd.DataFrame(),
target_rows=1000,
synthesizer=synthesizer,
max_trials=50
)
print(f"Trials: {constrainer.resample_trails}")
register()
Register a custom constraint type.
@classmethod
def register(cls, name: str, constraint_class: type)
Parameters
name
: Constraint type name
constraint_class
: Class inheriting from BaseConstrainer
Example
class CustomConstrainer(BaseConstrainer):
    def apply(self, df):
        # Custom logic
        return df
Constrainer.register('custom', CustomConstrainer)
Method Comparison
Method | Purpose | Modifies Data |
---|---|---|
apply() | Apply constraints and filter data | ✅ |
validate() | Validate data quality | ❌ |
resample_until_satisfy() | Generate constraint-compliant data | ✅ |
Notes
- Constraint configuration should be defined in YAML files
- Constraints are combined using strict AND logic
- validate() does not modify data, only checks
- resample_until_satisfy() is suitable for strict constraints