Create a Schema#
To use your dataset with the Xpdeep framework, you need a schema
object that defines the dataset structure.
The Schema exists at several levels.
As it can be tedious to find the correct schema, Xpdeep provides an AutoAnalyzer
to help you get a first schema version.
You can later update it if some of the feature analyses seem incorrect.
Note
The dataset used during the training process corresponds to the train dataset.
1. Find a Schema#
The first step consists in finding each feature's type and its associated preprocessor.
You can find the list of available Feature types
in the API reference.
Warning
For security reasons, we do not allow arbitrary code to be executed in the framework yet. Therefore, with StandardDataset,
your preprocessing must come from a list of trusted preprocessors.
Xpdeep currently supports scikit-learn
and pytorch
preprocessing to build your preprocessors.
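As an illustration, a trusted scikit-learn preprocessor is wrapped in an Xpdeep preprocessor object before being attached to a feature. A minimal sketch, using the SklearnPreprocessor wrapper shown later on this page:
from sklearn.preprocessing import StandardScaler
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor

# Wrap a trusted scikit-learn scaler so it can be attached to a feature.
scaler_preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())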
With the Auto Analyzer#
With a dataset object, you can get a first schema proposal using the analyze
method of a ParquetDataset
instance.
Set the Target(s)#
The only requirement is to indicate which feature(s) should be considered as target(s).
Use the target_names parameter to specify them prior to the analysis.
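For instance, with train_dataset being the ParquetDataset instance built in the full file preview below, the call looks like this:
# Analyze the train dataset, marking "flower_type" as the prediction target.
analyzed_train_dataset = train_dataset.analyze(target_names=["flower_type"])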
👀 Full file preview
import os
import tempfile
import pyarrow as pa
import pyarrow.parquet as pq
import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.upload import upload
demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)
train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})
xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))
# Write locally the raw data.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)
    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )  # Absolute path
    train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
    analyzed_train_dataset = train_dataset.analyze(target_names=["flower_type"])
    fitted_train_dataset = analyzed_train_dataset.fit()
    print(fitted_train_dataset.fitted_schema)
    fitted_validation_dataset = FittedParquetDataset(
        split_name="test",
        identifier_name="toy_dataset",
        path=directory["test_set_path"],
        fitted_schema=fitted_train_dataset.fitted_schema,
    )
You can also set the target name directly on the analyzed schema, after the analysis.
analyzed_train_dataset = train_dataset.analyze()
analyzed_train_dataset.analyzed_schema["flower_type"].is_target = True
Set the Features#
In addition, you can force a feature type by calling the analyze
method with specific features. In the following
example, the feature named "petal_length" will be a NumericalFeature.
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from sklearn.preprocessing import StandardScaler
forced_feature = NumericalFeature(
    name="petal_length",
    is_target=False,
    preprocessor=SklearnPreprocessor(preprocess_function=StandardScaler()),
)
analyzed_train_dataset = train_dataset.analyze(forced_feature)
As the returned schema is only a proposal, you can edit it later if it doesn't correctly match your needs. Any feature can be overwritten or updated.
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from sklearn.preprocessing import StandardScaler
# Overwrite the feature type and preprocessor after the schema inference.
analyzed_train_dataset = train_dataset.analyze()
analyzed_train_dataset.analyzed_schema["petal_length"] = NumericalFeature(
    name="petal_length",
    is_target=False,
    preprocessor=SklearnPreprocessor(
        preprocess_function=StandardScaler(),
    ),
)
This editable schema can be updated to match any other desired feature and preprocessing types.
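For example, the target could be re-declared as a CategoricalFeature with a one-hot preprocessor. This is only a sketch: CategoricalFeature appears in the fitted schema printed further below, but its import path and constructor arguments are assumed here to mirror NumericalFeature and should be checked against the API reference.
from sklearn.preprocessing import OneHotEncoder
from xpdeep.dataset.schema.feature.feature import CategoricalFeature  # assumed import path
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor

# Assumed constructor, mirroring NumericalFeature; verify in the API reference.
analyzed_train_dataset.analyzed_schema["flower_type"] = CategoricalFeature(
    name="flower_type",
    is_target=True,
    preprocessor=SklearnPreprocessor(preprocess_function=OneHotEncoder()),
)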
You can remove a feature from the schema if needed, using its name:
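A minimal sketch of what this could look like, assuming the analyzed schema supports item deletion by feature name in the same way it supports item access and assignment above (an assumption to verify in the API reference):
# Hypothetical: drop a feature from the analyzed schema by its name.
del analyzed_train_dataset.analyzed_schema["petal_width"]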
Or from Scratch#
You can also create your own analyzed schema from scratch, without using the auto-analyzer.
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from sklearn.preprocessing import StandardScaler
from xpdeep.dataset.schema.schema import AnalyzedSchema
feature_1 = NumericalFeature(
    name="petal_length",
    is_target=False,
    preprocessor=SklearnPreprocessor(
        preprocess_function=StandardScaler(),
    ),
)
feature_2 = NumericalFeature(
    name="petal_width",
    is_target=False,
    preprocessor=SklearnPreprocessor(
        preprocess_function=StandardScaler(),
    ),
)
analyzed_schema = AnalyzedSchema(feature_1, feature_2)
Finally, use the analyzed schema to build the AnalyzedParquetDataset.
from xpdeep.dataset.parquet_dataset import AnalyzedParquetDataset
analyzed_train_dataset = AnalyzedParquetDataset(
    split_name="train",
    identifier_name="my_local_train_dataset",
    path=directory["train_set_path"],
    analyzed_schema=analyzed_schema,
)
2. Fit the Schema#
Once satisfied with the feature types and their preprocessors, the next step is to fit the dataset schema. Each preprocessor is responsible for the mapping between the raw and the preprocessed space, and must be fitted to establish this association.
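As a plain scikit-learn analogy (not Xpdeep-specific code), fitting a preprocessor is what makes both directions of that mapping available:
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
raw = np.array([[1.4], [4.5], [6.0]])  # raw space
scaler.fit(raw)  # learn the mapping parameters on the train data
preprocessed = scaler.transform(raw)  # raw -> preprocessed
recovered = scaler.inverse_transform(preprocessed)  # preprocessed -> raw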
From the AnalyzedParquetDataset#
The analyzed dataset can be used to automatically fit each feature preprocessor.
fitted_train_dataset = analyzed_train_dataset.fit()
print(fitted_train_dataset.fitted_schema)
+-----------------------------------------------+
| Schema Contents |
+--------------------+--------------+-----------+
| Type | Name | Is Target |
+--------------------+--------------+-----------+
| NumericalFeature | petal_length | ❌ |
| NumericalFeature | petal_width | ❌ |
| CategoricalFeature | flower_type | ✅ |
+--------------------+--------------+-----------+
👀 Full file preview
import os
import tempfile
import pyarrow as pa
import pyarrow.parquet as pq
import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.upload import upload
demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)
train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})
xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))
# Write locally the raw data.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)
    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )  # Absolute path
    train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
    analyzed_train_dataset = train_dataset.analyze(target_names=["flower_type"])
    fitted_train_dataset = analyzed_train_dataset.fit()
    print(fitted_train_dataset.fitted_schema)
    fitted_validation_dataset = FittedParquetDataset(
        split_name="test",
        identifier_name="toy_dataset",
        path=directory["test_set_path"],
        fitted_schema=fitted_train_dataset.fitted_schema,
    )
Or from Scratch#
It is also possible to directly build a FittedParquetDataset
from an existing FittedSchema using the default constructor.
This can be useful to instantiate a FittedParquetDataset
for a test set from another dataset's schema.
from xpdeep.dataset.parquet_dataset import FittedParquetDataset
fitted_validation_dataset = FittedParquetDataset(
    split_name="test",
    identifier_name="my_local_test_dataset",
    path=directory["test_set_path"],
    fitted_schema=fitted_train_dataset.fitted_schema,
)
👀 Full file preview
import os
import tempfile
import pyarrow as pa
import pyarrow.parquet as pq
import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.upload import upload
demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)
train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})
xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))
# Write locally the raw data.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)
    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )  # Absolute path
    train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
    analyzed_train_dataset = train_dataset.analyze(target_names=["flower_type"])
    fitted_train_dataset = analyzed_train_dataset.fit()
    print(fitted_train_dataset.fitted_schema)
    fitted_validation_dataset = FittedParquetDataset(
        split_name="test",
        identifier_name="toy_dataset",
        path=directory["test_set_path"],
        fitted_schema=fitted_train_dataset.fitted_schema,
    )
Once in possession of a suitable fitted schema associated with your FittedParquetDataset, the next step is to
build your explainable model.