preprocessor

The schema package provides tools to infer and build a dataset schema.

Modules:

Name	Description
`preprocessor`	Feature preprocessor.
`utils_stable_hash`	Utility for the hash.
`zoo`	Preprocessor zoo.

Classes:

Name	Description
`IdentityPreprocessor`	Identity Preprocessor class.
`SklearnPreprocessor`	Preprocessor class based on sklearn preprocessing classes.
`TorchPreprocessor`	Preprocessor class based on pytorch.

`__all__ = ['IdentityPreprocessor', 'SklearnPreprocessor', 'TorchPreprocessor']` #

`IdentityPreprocessor` #

Identity Preprocessor class.

Parameters:

Name	Type	Description	Default
`preprocessed_size` #	`str`		`None`

Methods:

Name	Description
`to_model`	Convert to PreprocessorInsert instance.
`from_model`	Create the client object from api response.
`stable_hash`	Return the hash.

`to_model() -> PreprocessorInsert` #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def to_model(self) -> PreprocessorInsert:
    """Build the ``PreprocessorInsert`` payload for this identity preprocessor.

    The payload carries ``self.preprocessed_size`` unchanged together with an
    ``IdentityPreprocessorValue`` tagged ``"IDENTITY"``.
    """
    identity_value = IdentityPreprocessorValue(type_="IDENTITY")
    return PreprocessorInsert(preprocessed_size=self.preprocessed_size, value=identity_value)

`from_model(preprocessor_input: PreprocessorSelectInput) -> IdentityPreprocessor` #

Create the client object from api response.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

@classmethod
def from_model(cls, preprocessor_input: PreprocessorSelectInput) -> IdentityPreprocessor:
    """Rebuild the client-side object from an API response.

    Only ``preprocessor_input.preprocessed_size`` is read; it is forwarded
    as-is to the class constructor.
    """
    size = preprocessor_input.preprocessed_size
    return cls(preprocessed_size=size)

`stable_hash() -> str` #

Return the hash.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def stable_hash(self) -> str:
    """Return the SHA-256 hex digest of ``"<class name>_<preprocessed_size>"``."""
    fingerprint = f"{self.__class__.__name__}_{self.preprocessed_size}"
    digest = hashlib.sha256(fingerprint.encode()).hexdigest()
    return str(digest)

`SklearnPreprocessor` #

Preprocessor class based on sklearn preprocessing classes.

Parameters:

Name	Type	Description	Default
`preprocessed_size` #	`str`		`None`
`preprocess_function` #	`str`		required

Methods:

Name	Description
`transform`	Transform a feature raw value into its preprocessed value.
`inverse_transform`	Inverse transform a feature preprocessed value into its raw value.
`to_model`	Convert to PreprocessorInsert instance.
`from_model`	Create the client object from api response.
`stable_hash`	Return the hash.

Attributes:

Name	Type	Description
`preprocess_function`	`TransformerMixin`

`preprocess_function: TransformerMixin` #

`transform(feature_raw_value: object) -> torch.Tensor` #

Transform a feature raw value into its preprocessed value.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def transform(self, feature_raw_value: object) -> torch.Tensor:
    """Map a raw feature value to its preprocessed value.

    Delegates to ``self.preprocess_function.transform``.

    Raises
    ------
    TypeError
        If ``self.preprocess_function`` is not a ``TransformerMixin`` instance.
    """
    if isinstance(self.preprocess_function, TransformerMixin):
        return self.preprocess_function.transform(feature_raw_value)  # type: ignore[no-any-return]

    msg = f"{self.preprocess_function} was not parsable"
    raise TypeError(msg)

`inverse_transform(preprocessed_value: torch.Tensor) -> object` #

Inverse transform a feature preprocessed value into its raw value.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def inverse_transform(self, preprocessed_value: torch.Tensor) -> object:
    """Map a preprocessed value back to its raw feature value.

    Delegates to ``self.preprocess_function.inverse_transform``.

    Raises
    ------
    TypeError
        If ``self.preprocess_function`` is not a ``TransformerMixin`` instance.
    """
    if isinstance(self.preprocess_function, TransformerMixin):
        return self.preprocess_function.inverse_transform(preprocessed_value)

    msg = f"{self.preprocess_function} was not parsable"
    raise TypeError(msg)

`to_model() -> PreprocessorInsert` #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def to_model(self) -> PreprocessorInsert:
    """Build the ``PreprocessorInsert`` payload for this preprocessor.

    ``self.preprocess_function`` is dumped with skops, base64-encoded, and
    stored under the ``skops_blob`` key of a ``TrustedObjectInput`` state.
    """
    # Imported lazily: importing skops is slow, which hurts when debugging.
    import skops.io as skio  # noqa: PLC0415

    dumped_function = skio.dumps(self.preprocess_function)
    encoded_blob = base64.b64encode(dumped_function).decode("utf-8")

    state = StateDictInput.from_dict({
        "__dict__": {
            "root": {"skops_blob": encoded_blob}
        },
        "__pydantic_fields_set__": ["root"],
    })
    transformer = TrustedObjectInput(
        module="xpdeep_database.models.types.pydantic.trusted_object",
        class_="StateDict",
        reconstructor="_reconstructor",
        state=state,
    )
    value = NumpyPreprocessorValueInput(type_="NUMPY", transformer=transformer)

    return PreprocessorInsert(preprocessed_size=self.preprocessed_size, value=value)

`from_model(preprocessor_input: PreprocessorSelectInput) -> SklearnPreprocessor` #

Create the client object from api response.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

@classmethod
def from_model(cls, preprocessor_input: PreprocessorSelectInput) -> SklearnPreprocessor:
    """Rebuild the client-side object from an API response.

    Raises
    ------
    ApiError
        If the response value is not a ``NumpyPreprocessorValueInput``, or if
        the embedded skops blob cannot be located, decoded, or loaded.
    """
    value = preprocessor_input.value
    if not isinstance(value, NumpyPreprocessorValueInput):
        msg = "An error occurs while reading the preprocessor response."
        raise ApiError(msg)

    # Imported lazily: importing skops is slow, which hurts when debugging.
    import skops.io as skio  # noqa: PLC0415

    try:
        # Walk down to the base64 string stored under state["__dict__"]["root"]["skops_blob"].
        state_dict_entry = value.transformer.state["__dict__"]  # type:ignore[union-attr]
        root_entry = state_dict_entry.additional_properties["root"]
        encoded_blob = root_entry.additional_properties["skops_blob"]

        raw_blob = base64.b64decode(encoded_blob)  # type:ignore[arg-type]
        restored_function = skio.loads(data=raw_blob)

    except Exception as e:
        msg = "An error occurs while reading the scikit learn preprocessor json response."
        raise ApiError(msg) from e

    return cls(
        preprocessed_size=preprocessor_input.preprocessed_size,
        preprocess_function=restored_function,
    )

`stable_hash() -> str` #

Return the hash.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def stable_hash(self) -> str:
    """Return a SHA-256 hex digest identifying this preprocessor.

    The digest covers the class name, ``self.preprocessed_size`` and the value
    returned by ``serializable_preprocess_function`` for ``self.preprocess_function``.
    """
    function_fingerprint = serializable_preprocess_function(preprocess_function=self.preprocess_function)
    fingerprint = f"{self.__class__.__name__}_{self.preprocessed_size}_{function_fingerprint}"
    digest = hashlib.sha256(fingerprint.encode()).hexdigest()
    return str(digest)

`TorchPreprocessor(input_size: tuple[int, ...], module_transform: torch.nn.Module | None = None, module_inverse_transform: torch.nn.Module | None = None, **additional_attributes: object)` #

Preprocessor class based on pytorch.

To customize your preprocessor, inherit from this class and implement the transform and inverse_transform methods. Additionally, you can define module_transform and module_inverse_transform in the init method.

Initialize the preprocessor.

Parameters:

Name	Type	Description	Default
`input_size` #	`tuple[int, ...]`	The dimensions of the data that the preprocessor expects, excluding the batch size. `input_size` must match the dimensions of the data in your dataset. - Set an empty tuple `()` if no specific dimensions are provided (e.g., for scalar values). - Or, for example, for an array of size `(3, 2)` in your dataset, set `input_size` to `(3, 2)`.	required
`module_transform` #	`Module \| None`	A PyTorch module to preprocess data from the raw input space to the preprocessed space. If `transform` is not inherited, this will override the default `transform` method.	`None`
`module_inverse_transform` #	`Module \| None`	A PyTorch module to reverse the preprocessing, converting data from the preprocessed space back to the raw input space. If `inverse_transform` is not inherited, this will override the default `inverse_transform` method.	`None`
`**additional_attributes` #	`object`	Any additional keyword arguments can be passed when instantiating a TorchPreprocessor or a child class. These arguments will be set as class attributes. It can be especially useful to better customize the implementation of `transform` and `inverse_transform` methods.	`{}`

Methods:

Name	Description
`forward`	Transform.
`transform`	Process data, i.e. take a tensor as input and return the preprocessed tensor.
`inverse_transform`	Reciprocal of preprocess.
`to_model`	Convert to PreprocessorInsert instance.
`stable_hash`	Compute a stable hash for a torch module or exported program.
`from_model`	Convert to TorchPreprocessor.

Attributes:

Name	Type	Description
`input_size`
`ward`
`module_transform`
`module_inverse_transform`

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def __init__(
    self,
    input_size: tuple[int, ...],
    module_transform: torch.nn.Module | None = None,
    module_inverse_transform: torch.nn.Module | None = None,
    **additional_attributes: object,
):
    """Initialize the preprocessor.

    Parameters
    ----------
    input_size : tuple[int, ...]
        The dimensions of the data that the preprocessor expects, excluding the batch size.
        `input_size` must match the dimensions of the data in your dataset.
        - Set an empty tuple `()` if no specific dimensions are provided (e.g., for scalar values).
        - Or, for example, for an array of size `(3, 2)` in your dataset, set `input_size` to `(3, 2)`.
    module_transform : torch.nn.Module | None
        A PyTorch module to preprocess data from the raw input space to the preprocessed space.
        If `transform` is not inherited, this will override the default `transform` method.
    module_inverse_transform : torch.nn.Module | None
        A PyTorch module to reverse the preprocessing, converting data from the preprocessed space
        back to the raw input space.
        If `inverse_transform` is not inherited, this will override the default `inverse_transform` method.
    **additional_attributes : object
        Any additional keyword arguments can be passed when instantiating a TorchPreprocessor or a child class.
        Each one is set on the instance with `setattr`, after the attributes above, so a keyword
        named like one of them (e.g. `ward`) replaces the value assigned here.
        It can be especially useful to better customize the implementation of `transform` and
        `inverse_transform` methods.
    """
    super().__init__()

    self.input_size = input_size
    # Direction switch read by `forward`: True -> `transform`, False -> `inverse_transform`.
    self.ward = True
    self.module_transform = module_transform
    self.module_inverse_transform = module_inverse_transform

    # Attach every extra keyword argument to the instance under its own name.
    for additional_attr_name, additional_attr_value in additional_attributes.items():
        setattr(self, additional_attr_name, additional_attr_value)

`input_size = input_size` #

`ward = True` #

`module_transform = module_transform` #

`module_inverse_transform = module_inverse_transform` #

`forward(inputs: torch.Tensor) -> torch.Tensor` #

Transform.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def forward(self, inputs: torch.Tensor) -> torch.Tensor:
    """Dispatch to ``transform`` or ``inverse_transform`` depending on ``self.ward``.

    A truthy ``self.ward`` selects ``transform``; otherwise ``inverse_transform`` is used.
    """
    selected = self.transform if self.ward else self.inverse_transform
    return selected(inputs)

`transform(inputs: torch.Tensor) -> torch.Tensor` #

Process data, i.e. take a tensor as input and return the preprocessed tensor.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def transform(self, inputs: torch.Tensor) -> torch.Tensor:
    """Preprocess a tensor by applying ``self.module_transform`` to it.

    Raises
    ------
    NotImplementedError
        If no ``module_transform`` was provided; in that case this method is
        expected to be overridden.
    """
    module = self.module_transform
    if module is None:
        raise NotImplementedError("Implement this function.")
    preprocessed = module(inputs)
    return cast("torch.Tensor", preprocessed)

`inverse_transform(output: torch.Tensor) -> torch.Tensor` #

Reciprocal of preprocess.

i.e. $\forall x$: `inverse_transform(transform(x)) = transform(inverse_transform(x)) = x`.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def inverse_transform(self, output: torch.Tensor) -> torch.Tensor:
    r"""Undo the preprocessing by applying ``self.module_inverse_transform``.

    Intended contract:
    \forall x inverse_transform(transform(x)) = transform(inverse_transform(x)) = x.

    Raises
    ------
    NotImplementedError
        If no ``module_inverse_transform`` was provided; in that case this
        method is expected to be overridden.
    """
    module = self.module_inverse_transform
    if module is None:
        raise NotImplementedError("implement this function.")
    restored = module(output)
    return cast("torch.Tensor", restored)

`to_model() -> PreprocessorInsert` #

Convert to PreprocessorInsert instance.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

def to_model(self) -> PreprocessorInsert:
    """Convert to PreprocessorInsert instance.

    The preprocessor is turned into two torch artifacts through
    ``self._to_torch_artifact``: one with ``forward`` routed to ``transform``
    and one with ``forward`` routed to ``inverse_transform``. The routing is
    selected by temporarily flipping ``self.ward``.

    ``self.ward`` is restored to the value it had on entry, even if the
    conversion raises. Without this the instance would stay in
    inverse-transform mode after the call, so that a later ``forward`` would
    silently run ``inverse_transform``.

    Returns
    -------
    PreprocessorInsert
        Payload holding the preprocessed size (the output shape of ``transform``
        without its first dimension) and both converted artifacts.
    """
    # Run `transform` once on a random batch of two samples to discover the preprocessed shape.
    output = self.transform(torch.randn(size=(2, *self.input_size)))
    preprocessed_size = output.size()[1:]

    ward_on_entry = self.ward
    try:
        self.ward = True  # Use 'transform' on forward
        preprocess_transformer = convert_payload(
            self._to_torch_artifact(model=self, example_inputs=torch.randn(2, *self.input_size))
        )

        self.ward = False  # Use 'inverse_transform' on forward
        inverse_preprocess_transformer = convert_payload(
            self._to_torch_artifact(model=self, example_inputs=torch.randn(2, *preprocessed_size))
        )
    finally:
        # Never leak the temporary routing to the caller.
        self.ward = ward_on_entry

    return PreprocessorInsert(
        preprocessed_size=list(preprocessed_size),
        value=TorchPreprocessorValue(
            type_="TORCH", transformer=preprocess_transformer, inverse_transformer=inverse_preprocess_transformer
        ),
    )

`stable_hash() -> str` #

Compute a stable hash for a torch module or exported program.

Returns:

Type	Description
`str`	Hexadecimal digest that is stable across runs for identical parameters, buffers, and shapes.

Notes

The hash is computed by:

- Obtaining the state dictionary.
- Sorting keys lexicographically.
- Moving tensors to CPU, making them contiguous.
- Hashing key names, dtypes, shapes, and raw tensor bytes.

This makes the hash independent of device placement and dictionary insertion order. It will change whenever any parameter or buffer content or shape changes.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

@final
def stable_hash(self) -> str:
    """
    Compute a SHA-256 fingerprint of this preprocessor's state.

    Returns
    -------
    str
        Hexadecimal SHA-256 digest.

    Notes
    -----
    The hash is computed by:
    - Serializing this preprocessor with ``to_model`` and rebuilding it with
      ``TorchPreprocessor.from_model``; the rebuilt instance is the one hashed.
    - Hashing the string form of the rebuilt ``input_size``.
    - Iterating the rebuilt ``state_dict()`` in its own iteration order and,
      for each tensor entry (non-tensor entries are skipped), moving it to
      CPU and making it contiguous.
    - Hashing the key name (with the ``module_transform.`` and
      ``module_inverse_transform.`` substrings removed), the dtype and the shape.
    - Hashing the sum of absolute values of the tensor, rounded to 7 decimals.

    NOTE(review): keys are NOT sorted, so the digest depends on the iteration
    order of the state dict. Only a rounded absolute sum of each tensor is
    hashed, not its raw bytes, so distinct tensors with the same name, dtype,
    shape and rounded absolute sum yield the same digest. Confirm whether both
    are intended; changing either alters every digest produced so far.
    """
    # Round-trip through the exported representation before hashing, so that the
    # hashed state dict is the one obtained after export (export may change it).
    model = self.to_model()
    self_roundtrip = TorchPreprocessor.from_model(
        PreprocessorSelectInput(
            id="ignore",
            preprocessed_size=model.preprocessed_size,
            value=model.value,
            modality_index=model.modality_index if not isinstance(model.modality_index, Unset) else "unset",
        )
    )

    state_dict = self_roundtrip.state_dict()

    hash_object = hashlib.sha256()

    hash_object.update(f"{self_roundtrip.input_size}".encode())
    # NOTE(review): iterates in state-dict order; no explicit sort is applied here.
    for name in state_dict:
        tensor = state_dict[name]

        # Only handle tensor entries; skip others to avoid surprises
        if not isinstance(tensor, torch.Tensor):
            continue

        # Standardize representation: CPU, contiguous
        tensor_cpu = tensor.detach().to("cpu").contiguous()

        # Include metadata in the hash.
        # Strip the sub-module prefixes from the key name (the original comment says the
        # torch export round trip adds them).
        new_name = name.replace("module_transform.", "")
        new_name = new_name.replace("module_inverse_transform.", "")
        metadata = f"{new_name}|{tensor_cpu.dtype!s}|{tuple(tensor_cpu.shape)}\n"
        hash_object.update(metadata.encode("utf-8"))
        # Summarize the tensor content by its absolute sum, rounded to 7 decimals,
        # rather than hashing its raw bytes.
        tensor_norm = tensor_cpu.abs().sum().item()
        hash_object.update(f"{round(tensor_norm, 7)}".encode())

    return hash_object.hexdigest()

`from_model(preprocessor_input: PreprocessorSelectInput) -> TorchPreprocessor` #

Convert to TorchPreprocessor.

Source code in src/xpdeep/dataset/preprocessor/preprocessor.py

@classmethod
def from_model(cls, preprocessor_input: PreprocessorSelectInput) -> TorchPreprocessor:
    """Rebuild a ``TorchPreprocessor`` from an API response.

    Both the transformer and the inverse transformer carried by the response
    are turned back into callable modules. ``input_size`` is then recovered by
    running the inverse module on a random batch of two samples shaped like
    ``preprocessed_size`` and dropping the first dimension of the result.

    Raises
    ------
    ApiError
        If the response value is not a ``TorchPreprocessorValue``, or if the
        response carries no ``preprocessed_size``.
    """
    value = preprocessor_input.value
    if not isinstance(value, TorchPreprocessorValue):
        msg = "An error occurs while reading the preprocessor response."
        raise ApiError(msg)

    # Wrap the serialized payloads of each direction as artifacts.
    forward_payload = value.transformer
    forward_artifact = TorchArtifactTypes(
        exported_program=cast(bytes, forward_payload.exported_program),
        state_dict=cast(bytes, forward_payload.state_dict),
        constants=cast(bytes, forward_payload.constants),
        example_inputs=None,
    )

    backward_payload = value.inverse_transformer
    backward_artifact = TorchArtifactTypes(
        exported_program=cast(bytes, backward_payload.exported_program),
        state_dict=cast(bytes, backward_payload.state_dict),
        constants=cast(bytes, backward_payload.constants),
        example_inputs=None,
    )

    # Turn the artifacts back into callable modules.
    forward_module = forward_artifact.from_pydantic().module()
    backward_module = backward_artifact.from_pydantic().module()

    preprocessed_size = preprocessor_input.preprocessed_size
    if preprocessed_size is None:
        msg = "Unknow preprocessed_size of returned Torch preprocessor."
        raise ApiError(msg)

    # Probe the inverse module to infer the raw input shape (first dimension dropped).
    probe = torch.randn(size=(2, *preprocessed_size))
    input_size = backward_module(probe).size()[1:]

    return cls(
        input_size=input_size,
        module_transform=forward_module,
        module_inverse_transform=backward_module,
    )

preprocessor

`__all__ = ['IdentityPreprocessor', 'SklearnPreprocessor', 'TorchPreprocessor']` #

`IdentityPreprocessor` #

`preprocessed_size` #

`to_model() -> PreprocessorInsert` #

`from_model(preprocessor_input: PreprocessorSelectInput) -> IdentityPreprocessor` #

`stable_hash() -> str` #

`SklearnPreprocessor` #

`preprocessed_size` #

`preprocess_function` #

`preprocess_function: TransformerMixin` #

`transform(feature_raw_value: object) -> torch.Tensor` #

`inverse_transform(preprocessed_value: torch.Tensor) -> object` #

`to_model() -> PreprocessorInsert` #

`from_model(preprocessor_input: PreprocessorSelectInput) -> SklearnPreprocessor` #

`stable_hash() -> str` #

`TorchPreprocessor(input_size: tuple[int, ...], module_transform: torch.nn.Module | None = None, module_inverse_transform: torch.nn.Module | None = None, **additional_attributes: object)` #

`input_size` #

`module_transform` #

`module_inverse_transform` #

`**additional_attributes` #

`input_size = input_size` #

`ward = True` #

`module_transform = module_transform` #

`module_inverse_transform = module_inverse_transform` #

`forward(inputs: torch.Tensor) -> torch.Tensor` #

`transform(inputs: torch.Tensor) -> torch.Tensor` #

`inverse_transform(output: torch.Tensor) -> torch.Tensor` #

`to_model() -> PreprocessorInsert` #

`stable_hash() -> str` #

`from_model(preprocessor_input: PreprocessorSelectInput) -> TorchPreprocessor` #

preprocessor

__all__ = ['IdentityPreprocessor', 'SklearnPreprocessor', 'TorchPreprocessor'] #

IdentityPreprocessor #

preprocessed_size #

to_model() -> PreprocessorInsert #

from_model(preprocessor_input: PreprocessorSelectInput) -> IdentityPreprocessor #

stable_hash() -> str #

SklearnPreprocessor #

preprocessed_size #

preprocess_function #

preprocess_function: TransformerMixin #

transform(feature_raw_value: object) -> torch.Tensor #

inverse_transform(preprocessed_value: torch.Tensor) -> object #

to_model() -> PreprocessorInsert #

from_model(preprocessor_input: PreprocessorSelectInput) -> SklearnPreprocessor #

stable_hash() -> str #

TorchPreprocessor(input_size: tuple[int, ...], module_transform: torch.nn.Module | None = None, module_inverse_transform: torch.nn.Module | None = None, **additional_attributes: object) #

input_size #

module_transform #

module_inverse_transform #

**additional_attributes #

input_size = input_size #

ward = True #

module_transform = module_transform #

module_inverse_transform = module_inverse_transform #

forward(inputs: torch.Tensor) -> torch.Tensor #

transform(inputs: torch.Tensor) -> torch.Tensor #

inverse_transform(output: torch.Tensor) -> torch.Tensor #

to_model() -> PreprocessorInsert #

stable_hash() -> str #

from_model(preprocessor_input: PreprocessorSelectInput) -> TorchPreprocessor #

`__all__ = ['IdentityPreprocessor', 'SklearnPreprocessor', 'TorchPreprocessor']` #

`IdentityPreprocessor` #

`preprocessed_size` #

`to_model() -> PreprocessorInsert` #

`from_model(preprocessor_input: PreprocessorSelectInput) -> IdentityPreprocessor` #

`stable_hash() -> str` #

`SklearnPreprocessor` #

`preprocessed_size` #

`preprocess_function` #

`preprocess_function: TransformerMixin` #

`transform(feature_raw_value: object) -> torch.Tensor` #

`inverse_transform(preprocessed_value: torch.Tensor) -> object` #

`to_model() -> PreprocessorInsert` #

`from_model(preprocessor_input: PreprocessorSelectInput) -> SklearnPreprocessor` #

`stable_hash() -> str` #

`TorchPreprocessor(input_size: tuple[int, ...], module_transform: torch.nn.Module | None = None, module_inverse_transform: torch.nn.Module | None = None, **additional_attributes: object)` #

`input_size` #

`module_transform` #

`module_inverse_transform` #

`**additional_attributes` #

`input_size = input_size` #

`ward = True` #

`module_transform = module_transform` #

`module_inverse_transform = module_inverse_transform` #

`forward(inputs: torch.Tensor) -> torch.Tensor` #

`transform(inputs: torch.Tensor) -> torch.Tensor` #

`inverse_transform(output: torch.Tensor) -> torch.Tensor` #

`to_model() -> PreprocessorInsert` #

`stable_hash() -> str` #

`from_model(preprocessor_input: PreprocessorSelectInput) -> TorchPreprocessor` #