Skip to content

preprocessor

The schema package provides tools to infer and build a dataset schema.

Modules:

Name Description
preprocessor

Feature preprocessor.

utils_stable_hash

Utility for the hash.

zoo

Preprocessor zoo.

Classes:

Name Description
IdentityPreprocessor

Identity Preprocessor class.

SklearnPreprocessor

Preprocessor class based on sklearn preprocessing classes.

TorchPreprocessor

Preprocessor class based on pytorch.

__all__ = ['IdentityPreprocessor', 'SklearnPreprocessor', 'TorchPreprocessor'] #

IdentityPreprocessor #

Identity Preprocessor class.

Parameters:

Name Type Description Default

preprocessed_size #

str
None

Methods:

Name Description
to_model

Convert to PreprocessorInsert instance.

from_model

Create the client object from api response.

stable_hash

Return the hash.

to_model() -> PreprocessorInsert #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def to_model(self) -> PreprocessorInsert:
    """Build the ``PreprocessorInsert`` payload describing this identity preprocessor.

    The payload carries this instance's ``preprocessed_size`` and an
    ``IdentityPreprocessorValue`` tagged ``"IDENTITY"``.
    """
    size = self.preprocessed_size
    identity_value = IdentityPreprocessorValue(type_="IDENTITY")
    return PreprocessorInsert(preprocessed_size=size, value=identity_value)

from_model(preprocessor_input: PreprocessorSelectInput) -> IdentityPreprocessor #

Create the client object from api response.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
@classmethod
def from_model(cls, preprocessor_input: PreprocessorSelectInput) -> IdentityPreprocessor:
    """Instantiate the client-side object from an API response.

    Only ``preprocessed_size`` is read from ``preprocessor_input``.
    """
    size = preprocessor_input.preprocessed_size
    return cls(preprocessed_size=size)

stable_hash() -> str #

Return the hash.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def stable_hash(self) -> str:
    """Return the SHA-256 hex digest of ``"<class name>_<preprocessed_size>"``."""
    fingerprint = f"{type(self).__name__}_{self.preprocessed_size}"
    # hexdigest() already returns a str, so no extra conversion is needed.
    return hashlib.sha256(fingerprint.encode()).hexdigest()

SklearnPreprocessor #

Preprocessor class based on sklearn preprocessing classes.

Parameters:

Name Type Description Default

preprocessed_size #

str
None

preprocess_function #

str
required

Methods:

Name Description
transform

Transform a feature raw value into its preprocessed value.

inverse_transform

Inverse transform a feature preprocessed value into its raw value.

to_model

Convert to PreprocessorInsert instance.

from_model

Create the client object from api response.

stable_hash

Return the hash.

Attributes:

Name Type Description
preprocess_function TransformerMixin

preprocess_function: TransformerMixin #

transform(feature_raw_value: object) -> torch.Tensor #

Transform a feature raw value into its preprocessed value.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def transform(self, feature_raw_value: object) -> torch.Tensor:
    """Apply the wrapped preprocessing object to a raw feature value.

    Raises
    ------
    TypeError
        If ``preprocess_function`` is not a ``TransformerMixin`` instance.
    """
    transformer = self.preprocess_function
    if isinstance(transformer, TransformerMixin):
        return transformer.transform(feature_raw_value)  # type: ignore[no-any-return]

    msg = f"{transformer} was not parsable"
    raise TypeError(msg)

inverse_transform(preprocessed_value: torch.Tensor) -> object #

Inverse transform a feature preprocessed value into its raw value.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def inverse_transform(self, preprocessed_value: torch.Tensor) -> object:
    """Map a preprocessed value back to its raw value with the wrapped object.

    Raises
    ------
    TypeError
        If ``preprocess_function`` is not a ``TransformerMixin`` instance.
    """
    transformer = self.preprocess_function
    if isinstance(transformer, TransformerMixin):
        return transformer.inverse_transform(preprocessed_value)

    msg = f"{transformer} was not parsable"
    raise TypeError(msg)

to_model() -> PreprocessorInsert #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def to_model(self) -> PreprocessorInsert:
    """Build the ``PreprocessorInsert`` payload for this preprocessor.

    ``preprocess_function`` is dumped with skops, base64-encoded to text and
    embedded under ``__dict__ -> root -> skops_blob`` of a ``StateDictInput``.
    """
    # Lazy import because Skops import is quite slow when debugging.
    import skops.io as skio  # noqa: PLC0415

    serialized = skio.dumps(self.preprocess_function)
    blob_text = base64.b64encode(serialized).decode("utf-8")

    state = StateDictInput.from_dict({
        "__dict__": {"root": {"skops_blob": blob_text}},
        "__pydantic_fields_set__": ["root"],
    })
    trusted_transformer = TrustedObjectInput(
        module="xpdeep_database.models.types.pydantic.trusted_object",
        class_="StateDict",
        reconstructor="_reconstructor",
        state=state,
    )

    numpy_value = NumpyPreprocessorValueInput(type_="NUMPY", transformer=trusted_transformer)
    return PreprocessorInsert(preprocessed_size=self.preprocessed_size, value=numpy_value)

from_model(preprocessor_input: PreprocessorSelectInput) -> SklearnPreprocessor #

Create the client object from api response.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
@classmethod
def from_model(cls, preprocessor_input: PreprocessorSelectInput) -> SklearnPreprocessor:
    """Rebuild the client-side object from an API response.

    Raises
    ------
    ApiError
        If the response value is not a ``NumpyPreprocessorValueInput``, or if
        the skops blob cannot be located, base64-decoded or loaded.
    """
    api_value = preprocessor_input.value
    if not isinstance(api_value, NumpyPreprocessorValueInput):
        msg = "An error occurs while reading the preprocessor response."
        raise ApiError(msg)

    # Lazy import because Skops import is quite slow when debugging.
    import skops.io as skio  # noqa: PLC0415

    try:
        # Walk the nested payload: state["__dict__"] -> "root" -> "skops_blob".
        dunder_dict = api_value.transformer.state["__dict__"]  # type:ignore[union-attr]
        root_entry = dunder_dict.additional_properties["root"]
        encoded_blob = root_entry.additional_properties["skops_blob"]

        raw_blob = base64.b64decode(encoded_blob)  # type:ignore[arg-type]
        restored = skio.loads(data=raw_blob)
    except Exception as e:
        # Any failure while reading or decoding the payload is reported as an ApiError.
        msg = "An error occurs while reading the scikit learn preprocessor json response."
        raise ApiError(msg) from e

    return cls(preprocessed_size=preprocessor_input.preprocessed_size, preprocess_function=restored)

stable_hash() -> str #

Return the hash.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def stable_hash(self) -> str:
    """Return the SHA-256 hex digest identifying this preprocessor.

    The digest covers the class name, ``preprocessed_size`` and the value
    returned by ``serializable_preprocess_function`` for ``preprocess_function``.
    """
    function_fingerprint = serializable_preprocess_function(preprocess_function=self.preprocess_function)
    payload = f"{type(self).__name__}_{self.preprocessed_size}_{function_fingerprint}"
    # hexdigest() already returns a str, so no extra conversion is needed.
    return hashlib.sha256(payload.encode()).hexdigest()

TorchPreprocessor(input_size: tuple[int, ...], module_transform: torch.nn.Module | None = None, module_inverse_transform: torch.nn.Module | None = None, **additional_attributes: object) #

Preprocessor class based on pytorch.

To customize your preprocessor, inherit from this class and implement the transform and inverse_transform methods. Additionally, you can define module_transform and module_inverse_transform in the init method.

Initialize the preprocessor.

Parameters:

Name Type Description Default

input_size #

tuple[int, ...]

The dimensions of the data that the preprocessor expects, excluding the batch size. input_size must match the dimensions of the data in your dataset. - Set an empty tuple () if no specific dimensions are provided (e.g., for scalar values). - For example, for an array of size (3, 2) in your dataset, set input_size to (3, 2).

required

module_transform #

Module | None

A PyTorch module to preprocess data from the raw input space to the preprocessed space. If transform is not inherited, this will override the default transform method.

None

module_inverse_transform #

Module | None

A PyTorch module to reverse the preprocessing, converting data from the preprocessed space back to the raw input space. If inverse_transform is not inherited, this will override the default inverse_transform method.

None

**additional_attributes #

object

Any additional keyword arguments can be passed when instantiating a TorchPreprocessor or a child class. These arguments will be set as attributes on the instance. This is especially useful for customizing the implementation of the transform and inverse_transform methods.

{}

Methods:

Name Description
forward

Transform.

transform

Process data, i.e. take a tensor as input and return the preprocessed tensor.

inverse_transform

Reciprocal of preprocess.

to_model

Convert to PreprocessorInsert instance.

stable_hash

Compute a stable hash for a torch module or exported program.

from_model

Convert to TorchPreprocessor.

Attributes:

Name Type Description
input_size
ward
module_transform
module_inverse_transform
Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def __init__(
    self,
    input_size: tuple[int, ...],
    module_transform: torch.nn.Module | None = None,
    module_inverse_transform: torch.nn.Module | None = None,
    **additional_attributes: object,
):
    """Initialize the preprocessor.

    Parameters
    ----------
    input_size : tuple[int, ...]
        The dimensions of the data that the preprocessor expects, excluding the batch size.
        `input_size` must match the dimensions of the data in your dataset.
        - Set an empty tuple `()` if no specific dimensions are provided (e.g., for scalar values).
        - For example, for an array of size `(3, 2)` in your dataset, set `input_size` to `(3, 2)`.
    module_transform : torch.nn.Module | None
        A PyTorch module to preprocess data from the raw input space to the preprocessed space.
        It is used by the default `transform` method when `transform` is not overridden.
    module_inverse_transform : torch.nn.Module | None
        A PyTorch module to reverse the preprocessing, converting data from the preprocessed space
        back to the raw input space.
        It is used by the default `inverse_transform` method when `inverse_transform` is not overridden.
    **additional_attributes : object
        Any additional keyword arguments can be passed when instantiating a TorchPreprocessor or a child class.
        Each one is set as an attribute on the instance, under the keyword's name.
        This is especially useful for customizing the implementation of the `transform` and
        `inverse_transform` methods.
    """
    super().__init__()

    self.input_size = input_size
    # Direction flag read by `forward`: True runs `transform`, False runs `inverse_transform`.
    self.ward = True
    self.module_transform = module_transform
    self.module_inverse_transform = module_inverse_transform

    # Extra keyword arguments are assigned last, in the order they were passed.
    for attr_name, attr_value in additional_attributes.items():
        setattr(self, attr_name, attr_value)

input_size = input_size #

ward = True #

module_transform = module_transform #

module_inverse_transform = module_inverse_transform #

forward(inputs: torch.Tensor) -> torch.Tensor #

Transform.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
    """Run `transform` when `self.ward` is truthy, otherwise `inverse_transform`."""
    selected = self.transform if self.ward else self.inverse_transform
    return selected(inputs)

transform(inputs: torch.Tensor) -> torch.Tensor #

Process data, i.e. take a tensor as input and return the preprocessed tensor.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def transform(self, inputs: torch.Tensor) -> torch.Tensor:
    """Preprocess a tensor by delegating to `module_transform`.

    Raises
    ------
    NotImplementedError
        If `module_transform` is None; subclasses are expected to override this method.
    """
    module = self.module_transform
    if module is not None:
        return cast("torch.Tensor", module(inputs))
    raise NotImplementedError("Implement this function.")

inverse_transform(output: torch.Tensor) -> torch.Tensor #

Reciprocal of preprocess.

ie \forall x inverse_transform(transform(x)) = transform(inverse_transform(x)) = x.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def inverse_transform(self, output: torch.Tensor) -> torch.Tensor:
    r"""Undo the preprocessing by delegating to `module_inverse_transform`.

    Intended contract:
    \forall x inverse_transform(transform(x)) = transform(inverse_transform(x)) = x.

    Raises
    ------
    NotImplementedError
        If `module_inverse_transform` is None; subclasses are expected to override this method.
    """
    module = self.module_inverse_transform
    if module is not None:
        return cast("torch.Tensor", module(output))
    raise NotImplementedError("implement this function.")

to_model() -> PreprocessorInsert #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
def to_model(self) -> PreprocessorInsert:
    """Convert to PreprocessorInsert instance.

    The preprocessor is exported twice through ``_to_torch_artifact``: once with
    ``self.ward`` set to True so that ``forward`` runs ``transform``, and once with
    it set to False so that ``forward`` runs ``inverse_transform``.

    ``self.ward`` is restored to the value it had on entry, even if an export
    fails, so calling this method does not change what ``forward`` does afterwards.

    Returns
    -------
    PreprocessorInsert
        Payload holding the preprocessed size (output shape of ``transform`` without
        the batch dimension) and both exported artifacts.
    """
    # Probe `transform` with a random batch of two samples to discover the preprocessed shape.
    output = self.transform(torch.randn(size=(2, *self.input_size)))
    preprocessed_size = output.size()[1:]

    previous_ward = self.ward
    try:
        self.ward = True  # Use 'transform' on forward
        preprocess_transformer = convert_payload(
            self._to_torch_artifact(model=self, example_inputs=torch.randn(2, *self.input_size))
        )

        self.ward = False  # Use 'inverse_transform' on forward
        inverse_preprocess_transformer = convert_payload(
            self._to_torch_artifact(model=self, example_inputs=torch.randn(2, *preprocessed_size))
        )
    finally:
        # Without this reset, `forward` would keep calling `inverse_transform` after the export.
        self.ward = previous_ward

    return PreprocessorInsert(
        preprocessed_size=list(preprocessed_size),
        value=TorchPreprocessorValue(
            type_="TORCH", transformer=preprocess_transformer, inverse_transformer=inverse_preprocess_transformer
        ),
    )

stable_hash() -> str #

Compute a stable hash for a torch module or exported program.

Returns:

Type Description
str

Hexadecimal digest that is stable across runs for identical parameters, buffers, and shapes.

Notes

The hash is computed by: - Obtaining the state dictionary. - Sorting keys lexicographically. - Moving tensors to CPU, making them contiguous. - Hashing key names, dtypes, shapes, and raw tensor bytes.

This makes the hash independent of device placement and dictionary insertion order. It will change whenever any parameter or buffer content or shape changes.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
@final
def stable_hash(self) -> str:
    """
    Compute a hash of this preprocessor from its exported state.

    Returns
    -------
    str
        SHA-256 hexadecimal digest built from the input size and, for every
        tensor in the state dictionary, its name, dtype, shape and a rounded
        sum of absolute values.

    Notes
    -----
    The hash is computed by:
    - Exporting the preprocessor with ``to_model`` and reloading it with
      ``TorchPreprocessor.from_model``.
    - Taking the state dictionary of the reloaded instance.
    - Hashing the reloaded instance's ``input_size``.
    - For each tensor entry, in the state dictionary's own iteration order:
      moving it to CPU and making it contiguous, then hashing its name (with
      ``"module_transform."`` and ``"module_inverse_transform."`` removed), its
      dtype, its shape, and the sum of its absolute values rounded to 7 decimals.

    Non-tensor entries are skipped.

    NOTE(review): earlier documentation stated that keys are sorted
    lexicographically and that raw tensor bytes are hashed. The code does
    neither: there is no explicit sort, and only the rounded sum of absolute
    values stands in for the tensor content. Distinct tensors with the same
    name, dtype, shape and rounded sum therefore produce the same digest.
    Confirm which behaviour is intended.
    """
    # Make a torch export roundtrip to ensure consistency between state dicts:
    # the first export roundtrip may lead to state dict changes.
    model = self.to_model()
    self_roundtrip = TorchPreprocessor.from_model(
        PreprocessorSelectInput(
            id="ignore",
            preprocessed_size=model.preprocessed_size,
            value=model.value,
            modality_index=model.modality_index if not isinstance(model.modality_index, Unset) else "unset",
        )
    )

    state_dict = self_roundtrip.state_dict()

    hash_object = hashlib.sha256()

    hash_object.update(f"{self_roundtrip.input_size}".encode())
    # NOTE(review): keys are visited in the state dict's iteration order; no explicit sort is applied.
    for name in state_dict:
        tensor = state_dict[name]

        # Only handle tensor entries; skip others to avoid surprises
        if not isinstance(tensor, torch.Tensor):
            continue

        # Standardize representation: detached, on CPU, contiguous
        tensor_cpu = tensor.detach().to("cpu").contiguous()

        # Include metadata in the hash
        # Strip the "module_transform." / "module_inverse_transform." parts of the name,
        # because the torch export roundtrip adds them.
        new_name = name.replace("module_transform.", "")
        new_name = new_name.replace("module_inverse_transform.", "")
        metadata = f"{new_name}|{tensor_cpu.dtype!s}|{tuple(tensor_cpu.shape)}\n"
        hash_object.update(metadata.encode("utf-8"))
        # Include the tensor content as the sum of absolute values, rounded to 7 decimals
        # (the raw bytes are not hashed).
        tensor_norm = tensor_cpu.abs().sum().item()
        hash_object.update(f"{round(tensor_norm, 7)}".encode())

    return hash_object.hexdigest()

from_model(preprocessor_input: PreprocessorSelectInput) -> TorchPreprocessor #

Convert to TorchPreprocessor.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py
@classmethod
def from_model(cls, preprocessor_input: PreprocessorSelectInput) -> TorchPreprocessor:
    """Rebuild a TorchPreprocessor from an API response.

    Both exported artifacts are loaded back into callable modules. ``input_size``
    is then obtained by running the inverse module on a random batch of two
    samples shaped like ``preprocessed_size`` and dropping the batch dimension.

    Raises
    ------
    ApiError
        If the response value is not a ``TorchPreprocessorValue`` or if
        ``preprocessed_size`` is None.
    """
    api_value = preprocessor_input.value
    if not isinstance(api_value, TorchPreprocessorValue):
        msg = "An error occurs while reading the preprocessor response."
        raise ApiError(msg)

    def _as_artifact(payload):  # noqa: ANN001, ANN202
        # Convert back to bytes
        return TorchArtifactTypes(
            exported_program=cast(bytes, payload.exported_program),
            state_dict=cast(bytes, payload.state_dict),
            constants=cast(bytes, payload.constants),
            example_inputs=None,
        )

    # Build both artifacts first, then load them, in the same order as before.
    forward_artifact = _as_artifact(api_value.transformer)
    backward_artifact = _as_artifact(api_value.inverse_transformer)

    forward_module = forward_artifact.from_pydantic().module()
    backward_module = backward_artifact.from_pydantic().module()

    preprocessed_size = preprocessor_input.preprocessed_size
    if preprocessed_size is None:
        msg = "Unknow preprocessed_size of returned Torch preprocessor."
        raise ApiError(msg)

    probe = torch.randn(size=(2, *preprocessed_size))
    recovered_input_size = backward_module(probe).size()[1:]

    return cls(
        input_size=recovered_input_size,
        module_transform=forward_module,
        module_inverse_transform=backward_module,
    )