Skip to content

Bases: BaseSynthesizer

Source code in ydata/sdk/synthesizers/regular.py
class RegularSynthesizer(BaseSynthesizer):
    """Synthesizer that is fitted on datasources of type `DataSourceType.TABULAR`."""

    def sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:
        """Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]
        instance.

        Arguments:
            n_samples (int): number of rows in the sample (must be at least 1)
            condition_on (Optional[dict]): (optional) conditional sampling parameters

        Returns:
            synthetic data

        Raises:
            InputError: if `n_samples` is lower than 1
        """
        if n_samples < 1:
            raise InputError("Parameter 'n_samples' must be greater than 0")

        # The conditional parameters are only attached to the request when
        # the caller provided them.
        extra = {} if condition_on is None else {
            "extraData": {"condition_on": condition_on}
        }
        return self._sample(payload={"numberOfRecords": n_samples, **extra})

    def fit(self, X: Union[DataSource, pdDataFrame],
            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
            entities: Optional[Union[str, List[str]]] = None,
            generate_cols: Optional[List[str]] = None,
            exclude_cols: Optional[List[str]] = None,
            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
            target: Optional[str] = None,
            anonymize: Optional[dict] = None,
            condition_on: Optional[List[str]] = None) -> None:
        """Fit the synthesizer.

        The training dataset can be given either as a pandas [`DataFrame`][pandas.DataFrame] or as a YData [`DataSource`][ydata.sdk.datasources.DataSource].
        Every argument is forwarded to `BaseSynthesizer.fit`, with the datasource type fixed to tabular.

        Arguments:
            X (Union[DataSource, pandas.DataFrame]): Training dataset
            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)
            entities (Union[str, List[str]]): (optional) columns representing entities ID
            generate_cols (List[str]): (optional) columns that should be synthesized
            exclude_cols (List[str]): (optional) columns that should not be synthesized
            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes
            target (Optional[str]): (optional) Target column
            anonymize (Optional[dict]): (optional) fields to anonymize and the anonymization strategy
            condition_on (Optional[List[str]]): (optional) list of features to condition upon
        """
        BaseSynthesizer.fit(
            self,
            X=X,
            datatype=DataSourceType.TABULAR,
            privacy_level=privacy_level,
            entities=entities,
            generate_cols=generate_cols,
            exclude_cols=exclude_cols,
            dtypes=dtypes,
            target=target,
            anonymize=anonymize,
            condition_on=condition_on,
        )

    def __repr__(self):
        """Return the model's representation, or a placeholder when no model is set."""
        if self._model is None:
            return "RegularSynthesizer(Not Initialized)"
        return self._model.__repr__()

fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)

Fit the synthesizer.

The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource.

Parameters:

Name Type Description Default
X Union[DataSource, DataFrame]

Training dataset

required
privacy_level PrivacyLevel

Synthesizer privacy level (defaults to high fidelity)

HIGH_FIDELITY
entities Union[str, List[str]]

(optional) columns representing entities ID

None
generate_cols List[str]

(optional) columns that should be synthesized

None
exclude_cols List[str]

(optional) columns that should not be synthesized

None
dtypes Dict[str, Union[str, DataType]]

(optional) datatype mapping that will overwrite the datasource metadata column datatypes

None
target Optional[str]

(optional) Target column

None
name Optional[str]

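(optional) Synthesizer instance name (note: `name` is documented in the docstring but is not a parameter of the `fit` signature shown above)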

required
anonymize Optional[dict]

(optional) fields to anonymize and the anonymization strategy

None
condition_on Optional[List[str]]

(optional) list of features to condition upon

None
Source code in ydata/sdk/synthesizers/regular.py
def fit(self, X: Union[DataSource, pdDataFrame],
        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
        entities: Optional[Union[str, List[str]]] = None,
        generate_cols: Optional[List[str]] = None,
        exclude_cols: Optional[List[str]] = None,
        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
        target: Optional[str] = None,
        anonymize: Optional[dict] = None,
        condition_on: Optional[List[str]] = None) -> None:
    """Fit the synthesizer.

    The training dataset can be given either as a pandas [`DataFrame`][pandas.DataFrame] or as a YData [`DataSource`][ydata.sdk.datasources.DataSource].
    Every argument is forwarded to `BaseSynthesizer.fit`, with the datasource type fixed to tabular.

    Arguments:
        X (Union[DataSource, pandas.DataFrame]): Training dataset
        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)
        entities (Union[str, List[str]]): (optional) columns representing entities ID
        generate_cols (List[str]): (optional) columns that should be synthesized
        exclude_cols (List[str]): (optional) columns that should not be synthesized
        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes
        target (Optional[str]): (optional) Target column
        anonymize (Optional[dict]): (optional) fields to anonymize and the anonymization strategy
        condition_on (Optional[List[str]]): (optional) list of features to condition upon
    """
    BaseSynthesizer.fit(
        self,
        X=X,
        datatype=DataSourceType.TABULAR,
        privacy_level=privacy_level,
        entities=entities,
        generate_cols=generate_cols,
        exclude_cols=exclude_cols,
        dtypes=dtypes,
        target=target,
        anonymize=anonymize,
        condition_on=condition_on,
    )

sample(n_samples=1, condition_on=None)

Sample from a RegularSynthesizer instance.

Parameters:

Name Type Description Default
n_samples int

number of rows in the sample

1
condition_on Optional[dict]

(optional) conditional sampling parameters

None

Returns:

Type Description
DataFrame

synthetic data

Source code in ydata/sdk/synthesizers/regular.py
def sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:
    """Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]
    instance.

    Arguments:
        n_samples (int): number of rows in the sample (must be at least 1)
        condition_on (Optional[dict]): (optional) conditional sampling parameters

    Returns:
        synthetic data

    Raises:
        InputError: if `n_samples` is lower than 1
    """
    if n_samples < 1:
        raise InputError("Parameter 'n_samples' must be greater than 0")

    request = {"numberOfRecords": n_samples}
    if condition_on is None:
        # Unconditional sampling: only the number of records is sent.
        return self._sample(payload=request)

    request["extraData"] = {"condition_on": condition_on}
    return self._sample(payload=request)

PrivacyLevel

Bases: StringEnum

Privacy level exposed to the end-user.

BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute

Balanced privacy/fidelity

HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute

High fidelity

HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute

High privacy