Skip to content

preprocessor

The schema package provides tools to infer and build a dataset schema.

Modules:

Name Description
preprocessor

Feature preprocessor.

zoo

Preprocessor zoo.

Classes:

Name Description
IdentityPreprocessor

Identity Preprocessor class.

SklearnPreprocessor

Preprocessor class based on sklearn preprocessing classes.

TorchPreprocessor

Preprocessor class based on pytorch.

__all__ = ['IdentityPreprocessor', 'SklearnPreprocessor', 'TorchPreprocessor'] #

IdentityPreprocessor #

Identity Preprocessor class.

Parameters:

Name Type Description Default

preprocessed_size #

str
None

Methods:

Name Description
to_model

Convert to PreprocessorInsert instance.

from_model

Create the client object from api response.

to_model() -> PreprocessorInsert #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def to_model(self) -> PreprocessorInsert:
    """Build the ``PreprocessorInsert`` payload for this identity preprocessor.

    Returns
    -------
    PreprocessorInsert
        Payload carrying ``self.preprocessed_size`` and an
        ``IdentityPreprocessorValue`` tagged ``"IDENTITY"``.
    """
    # Only the type tag is passed to the value object.
    identity_value = IdentityPreprocessorValue(type_="IDENTITY")
    return PreprocessorInsert(preprocessed_size=self.preprocessed_size, value=identity_value)

from_model(json_response: dict[str, object]) -> IdentityPreprocessor #

Create the client object from api response.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
@classmethod
def from_model(cls, json_response: dict[str, object]) -> IdentityPreprocessor:
    """Build the client object from an API response.

    Parameters
    ----------
    json_response : dict[str, object]
        Raw response, parsed with ``PreprocessorSelect.from_dict``.

    Returns
    -------
    IdentityPreprocessor
        Instance created with the ``preprocessed_size`` read from the parsed response.
    """
    parsed_size = PreprocessorSelect.from_dict(json_response).preprocessed_size
    return cls(preprocessed_size=parsed_size)

SklearnPreprocessor #

Preprocessor class based on sklearn preprocessing classes.

Parameters:

Name Type Description Default

preprocessed_size #

str
None

preprocess_function #

str
required

Methods:

Name Description
transform

Transform a feature raw value into its preprocessed value.

inverse_transform

Inverse transform a feature preprocessed value into its raw value.

to_model

Convert to PreprocessorInsert instance.

from_model

Create the client object from api response.

Attributes:

Name Type Description
preprocess_function TransformerMixin

preprocess_function: TransformerMixin #

transform(feature_raw_value: object) -> torch.Tensor #

Transform a feature raw value into its preprocessed value.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def transform(self, feature_raw_value: object) -> torch.Tensor:
    """Apply the wrapped transformer to a raw feature value.

    Parameters
    ----------
    feature_raw_value : object
        Value forwarded unchanged to ``preprocess_function.transform``.

    Raises
    ------
    TypeError
        If ``preprocess_function`` is not a ``TransformerMixin`` instance.
    """
    transformer = self.preprocess_function
    if isinstance(transformer, TransformerMixin):
        return transformer.transform(feature_raw_value)  # type: ignore[no-any-return]

    msg = f"{self.preprocess_function} was not parsable"
    raise TypeError(msg)

inverse_transform(preprocessed_value: torch.Tensor) -> object #

Inverse transform a feature preprocessed value into its raw value.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def inverse_transform(self, preprocessed_value: torch.Tensor) -> object:
    """Map a preprocessed value back to its raw value with the wrapped transformer.

    Parameters
    ----------
    preprocessed_value : torch.Tensor
        Value forwarded unchanged to ``preprocess_function.inverse_transform``.

    Raises
    ------
    TypeError
        If ``preprocess_function`` is not a ``TransformerMixin`` instance.
    """
    transformer = self.preprocess_function
    if isinstance(transformer, TransformerMixin):
        return transformer.inverse_transform(preprocessed_value)

    msg = f"{self.preprocess_function} was not parsable"
    raise TypeError(msg)

to_model() -> PreprocessorInsert #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def to_model(self) -> PreprocessorInsert:
    """Serialize the wrapped transformer into a ``PreprocessorInsert`` payload.

    ``preprocess_function`` is dumped with skops, base64-encoded to a UTF-8
    string, and stored under ``__dict__ -> root -> skops_blob`` in the state of
    a ``TrustedObjectInput``.

    Returns
    -------
    PreprocessorInsert
        Payload carrying ``self.preprocessed_size`` and a
        ``NumpyPreprocessorValueInput`` tagged ``"NUMPY"``.
    """
    # Lazy import because Skops import is quite slow when debugging.
    import skops.io as skio  # noqa: PLC0415

    serialized = skio.dumps(self.preprocess_function)
    skops_blob = base64.b64encode(serialized).decode("utf-8")

    state = StateDictInput.from_dict({
        "__dict__": {"root": {"skops_blob": skops_blob}},
        "__pydantic_fields_set__": ["root"],
    })
    transformer = TrustedObjectInput(
        module="xpdeep_database.models.types.pydantic.trusted_object",
        class_="StateDict",
        reconstructor="_reconstructor",
        state=state,
    )

    numpy_value = NumpyPreprocessorValueInput(type_="NUMPY", transformer=transformer)
    return PreprocessorInsert(preprocessed_size=self.preprocessed_size, value=numpy_value)

from_model(json_response: dict[str, object]) -> SklearnPreprocessor #

Create the client object from api response.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
@classmethod
def from_model(cls, json_response: dict[str, object]) -> SklearnPreprocessor:
    """Build the client object from an API response.

    Parameters
    ----------
    json_response : dict[str, object]
        Raw response, parsed with ``PreprocessorInsert.from_dict``.

    Raises
    ------
    ApiError
        If the parsed value is not a ``NumpyPreprocessorValueInput``, or if the
        skops blob cannot be located, decoded or loaded.
    """
    parsed = PreprocessorInsert.from_dict(json_response)

    value = parsed.value
    if not isinstance(value, NumpyPreprocessorValueInput):
        msg = "An error occurs while reading the preprocessor response."
        raise ApiError(msg)

    # Lazy import because Skops import is quite slow when debugging.
    import skops.io as skio  # noqa: PLC0415

    try:
        # Walk state -> "__dict__" -> "root" -> "skops_blob" to reach the base64 string.
        state_dict = value.transformer.state["__dict__"]  # type:ignore[union-attr]
        root = state_dict.additional_properties["root"]
        encoded_blob = root.additional_properties["skops_blob"]

        decoded_blob = base64.b64decode(encoded_blob)  # type:ignore[arg-type]
        sklearn_transformer = skio.loads(data=decoded_blob)
    except Exception as e:
        # Any failure while reading the blob is reported as a single API error.
        msg = "An error occurs while reading the scikit learn preprocessor json response."
        raise ApiError(msg) from e

    return cls(preprocessed_size=parsed.preprocessed_size, preprocess_function=sklearn_transformer)

TorchPreprocessor(input_size: tuple[int, ...], module_transform: torch.nn.Module | None = None, module_inverse_transform: torch.nn.Module | None = None, **additional_attributes: object) #

Preprocessor class based on pytorch.

To customize your preprocessor, inherit from this class and implement the transform and inverse_transform methods. Additionally, you can define module_transform and module_inverse_transform in the init method.

Initialize the preprocessor.

Parameters:

Name Type Description Default

input_size #

tuple[int, ...]

The dimensions of the data that the preprocessor expects, excluding the batch size. input_size must match the dimensions of the data in your dataset. - Set an empty tuple () if no specific dimensions are provided (e.g., for scalar values). - Or, for example, for an array of size (3, 2) in your dataset, set input_size to (3, 2).

required

module_transform #

Module | None

A PyTorch module to preprocess data from the raw input space to the preprocessed space. If transform is not inherited, this will override the default transform method.

None

module_inverse_transform #

Module | None

A PyTorch module to reverse the preprocessing, converting data from the preprocessed space back to the raw input space. If inverse_transform is not inherited, this will override the default inverse_transform method.

None

**additional_attributes #

object

Any additional keyword arguments can be passed when instantiating a TorchPreprocessor or a child class. These arguments will be set as attributes on the instance. This is especially useful for customizing the implementation of the transform and inverse_transform methods.

{}

Methods:

Name Description
forward

Transform.

transform

Process data, i.e. take a tensor as input and return the preprocessed tensor.

inverse_transform

Reciprocal of preprocess.

to_model

Convert to PreprocessorInsert instance.

from_model

Convert to TorchPreprocessor.

Attributes:

Name Type Description
input_size
ward
module_transform
module_inverse_transform
Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def __init__(
    self,
    input_size: tuple[int, ...],
    module_transform: torch.nn.Module | None = None,
    module_inverse_transform: torch.nn.Module | None = None,
    **additional_attributes: object,
):
    """Initialize the preprocessor.

    Parameters
    ----------
    input_size : tuple[int, ...]
        The dimensions of the data that the preprocessor expects, excluding the batch size.
        `input_size` must match the dimensions of the data in your dataset.
        - Set an empty tuple `()` if no specific dimensions are provided (e.g., for scalar values).
        - For example, for an array of size `(3, 2)` in your dataset, set `input_size` to `(3, 2)`.
    module_transform : torch.nn.Module | None
        A PyTorch module mapping data from the raw input space to the preprocessed space.
        The default `transform` method calls it when it is not `None`.
    module_inverse_transform : torch.nn.Module | None
        A PyTorch module mapping data from the preprocessed space back to the raw input space.
        The default `inverse_transform` method calls it when it is not `None`.
    **additional_attributes : object
        Any additional keyword arguments passed when instantiating a TorchPreprocessor or a child class.
        Each one is set as an attribute on the instance, which is useful when customizing
        the `transform` and `inverse_transform` methods.
    """
    super().__init__()

    # Direction flag read by `forward`: True calls `transform`, False calls `inverse_transform`.
    self.ward = True
    self.input_size = input_size
    self.module_transform = module_transform
    self.module_inverse_transform = module_inverse_transform

    # Expose every extra keyword argument as an instance attribute.
    for name, value in additional_attributes.items():
        setattr(self, name, value)

input_size = input_size #

ward = True #

module_transform = module_transform #

module_inverse_transform = module_inverse_transform #

forward(inputs: torch.Tensor) -> torch.Tensor #

Transform.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
    """Run `transform` when `self.ward` is truthy, otherwise `inverse_transform`."""
    direction = self.transform if self.ward else self.inverse_transform
    return direction(inputs)

transform(inputs: torch.Tensor) -> torch.Tensor #

Process data, i.e. take a tensor as input and return the preprocessed tensor.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def transform(self, inputs: torch.Tensor) -> torch.Tensor:
    """Preprocess a tensor by calling `module_transform` on it.

    Raises
    ------
    NotImplementedError
        If `module_transform` is `None`.
    """
    module = self.module_transform
    if module is not None:
        return cast("torch.Tensor", module(inputs))
    raise NotImplementedError("Implement this function.")

inverse_transform(output: torch.Tensor) -> torch.Tensor #

Reciprocal of preprocess.

i.e., for all x: inverse_transform(transform(x)) = transform(inverse_transform(x)) = x.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def inverse_transform(self, output: torch.Tensor) -> torch.Tensor:
    r"""Reverse the preprocessing by calling `module_inverse_transform` on a tensor.

    Intended as the reciprocal of `transform`, i.e. for all x:
    inverse_transform(transform(x)) = transform(inverse_transform(x)) = x.

    Raises
    ------
    NotImplementedError
        If `module_inverse_transform` is `None`.
    """
    module = self.module_inverse_transform
    if module is not None:
        return cast("torch.Tensor", module(output))
    raise NotImplementedError("implement this function.")

to_model() -> PreprocessorInsert #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def to_model(self) -> PreprocessorInsert:
    """Convert to PreprocessorInsert instance.

    `forward` is exported twice through `self._to_torch_artifact`: once with
    `self.ward` set to True (so it runs `transform`) and once with it set to
    False (so it runs `inverse_transform`). The flag is restored to its
    previous value afterwards, so calling `to_model` does not change what later
    `forward` calls do.

    Returns
    -------
    PreprocessorInsert
        Payload carrying the preprocessed size (output shape of `transform`
        without its first dimension) and a `TorchPreprocessorValue` tagged
        `"TORCH"` holding both converted artifacts.
    """
    # Run `transform` on a random batch of 2 samples to find the preprocessed shape.
    output = self.transform(torch.randn(size=(2, *self.input_size)))
    preprocessed_size = output.size()[1:]

    previous_ward = self.ward
    try:
        self.ward = True  # Use 'transform' on forward
        preprocess_transformer = convert_payload(
            self._to_torch_artifact(model=self, example_inputs=torch.randn(2, *self.input_size))
        )

        self.ward = False  # Use 'inverse_transform' on forward
        inverse_preprocess_transformer = convert_payload(
            self._to_torch_artifact(model=self, example_inputs=torch.randn(2, *preprocessed_size))
        )
    finally:
        # Without this, the instance would stay in inverse mode after export
        # (or after a failed export), silently inverting later `forward` calls.
        self.ward = previous_ward

    return PreprocessorInsert(
        preprocessed_size=list(preprocessed_size),
        value=TorchPreprocessorValue(
            type_="TORCH", transformer=preprocess_transformer, inverse_transformer=inverse_preprocess_transformer
        ),
    )

from_model(json_response: dict[str, object]) -> TorchPreprocessor #

Convert to TorchPreprocessor.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
@classmethod
def from_model(cls, json_response: dict[str, object]) -> TorchPreprocessor:
    """Build a TorchPreprocessor from an API response.

    Parameters
    ----------
    json_response : dict[str, object]
        Raw response, parsed with ``PreprocessorInsert.from_dict``.

    Raises
    ------
    ApiError
        If the parsed value is not a ``TorchPreprocessorValue``, or if the
        response carries no ``preprocessed_size``.
    """
    parsed = PreprocessorInsert.from_dict(json_response)

    value = parsed.value
    if not isinstance(value, TorchPreprocessorValue):
        msg = "An error occurs while reading the preprocessor response."
        raise ApiError(msg)

    # Rebuild one artifact per direction from the serialized fields.
    # No example inputs are supplied (set to None).
    forward_artifact = TorchArtifactTypes(
        exported_program=value.transformer.exported_program,  # type: ignore[arg-type]
        state_dict=value.transformer.state_dict,  # type: ignore[arg-type]
        constants=value.transformer.constants,  # type: ignore[arg-type]
        example_inputs=None,
    )
    inverse_artifact = TorchArtifactTypes(
        exported_program=value.inverse_transformer.exported_program,  # type: ignore[arg-type]
        state_dict=value.inverse_transformer.state_dict,  # type: ignore[arg-type]
        constants=value.inverse_transformer.constants,  # type: ignore[arg-type]
        example_inputs=None,
    )

    forward_module = forward_artifact.from_pydantic().module()
    inverse_module = inverse_artifact.from_pydantic().module()

    preprocessed_size = parsed.preprocessed_size
    if preprocessed_size is None:
        msg = "Unknow preprocessed_size of returned Torch preprocessor."
        raise ApiError(msg)

    # The raw input size is not read from the response: it is recovered by pushing a
    # random batch of 2 samples through the inverse module and dropping the first dimension.
    probe = torch.randn(size=(2, *preprocessed_size))
    input_size = inverse_module(probe).size()[1:]

    return cls(
        input_size=input_size,
        module_transform=forward_module,
        module_inverse_transform=inverse_module,
    )