Create your Explainable Parquet Dataset
See the key concepts to instantiate a project before creating the dataset.
1. Convert your Raw Data
Currently, Xpdeep StandardDataset supports the .parquet format via the ParquetDataset class. Parquet files are backed by an Arrow Table, where each column represents a feature.
Note
Xpdeep currently supports cloud storage (S3, GCP) and local storage for your .parquet files. Internally, it relies on fsspec to transfer the data when it is stored in a cloud location.
You must then split your data into train, validation, and test sets. Each split is itself a ParquetDataset.
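For illustration only, here is a minimal pyarrow sketch of one way to write each split to its own .parquet file. The column names, split sizes, and file names are placeholders, and no Xpdeep API is involved yet.
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical raw table with 100 rows; replace with your own data.
table = pa.table({"feature": list(range(100)), "target": [i % 2 for i in range(100)]})

# A simple 70 / 15 / 15 split by row ranges (shuffle beforehand if needed).
splits = {
    "train": table.slice(0, 70),
    "validation": table.slice(70, 15),
    "test": table.slice(85, 15),
}

for split_name, split_table in splits.items():
    # Each split becomes its own .parquet file, later wrapped in a ParquetDataset.
    pq.write_table(split_table, f"{split_name}.parquet")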
The Arrow Format
As HuggingFace's Datasets package underlines, Arrow makes it possible to process and move large quantities of data quickly.
It is a specific data format that stores data in a columnar memory layout. This provides several significant advantages:
- Arrow's standard format allows zero-copy reads, which removes virtually all serialization overhead.
- Arrow is language-agnostic, so it supports different programming languages.
- Arrow is column-oriented, so it is faster at querying and processing slices or columns of data.
- Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow (illustrated in the short sketch after this list).
- Arrow supports many, possibly nested, column types.
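To make the last two points concrete, here is a tiny pyarrow-only sketch (toy values, no Xpdeep API involved) of column access and the hand-off to NumPy and Pandas:
import pyarrow as pa

table = pa.table({"petal_length": [1.4, 1.5, 1.3], "petal_width": [0.2, 0.2, 0.2]})

# Column-oriented layout: selecting a single column is cheap.
petal_length = table["petal_length"]

# Hand-off to NumPy and Pandas; for fixed-width numerical columns this can
# reuse the underlying Arrow buffers instead of copying them.
petal_length_np = petal_length.to_numpy()
df = table.to_pandas()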
Let's make an Arrow Table from a toy dataset of 8 samples, with two numerical features, "petal_length" and "petal_width", and a categorical target, "flower_type".
Here, we add an additional column, "index_xp_deep", required by Xpdeep; it is currently just the row index.
import pyarrow as pa
import pyarrow.parquet as pq

raw_data = pa.table(
    {
        'petal_length': [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
        'petal_width': [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
        'flower_type': ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
        'index_xp_deep': list(range(8)),
    }
)

# Write the table to a Parquet file
pq.write_table(raw_data, "train.parquet")
Full file preview
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import ParquetDataset
from xpdeep.dataset.upload import upload

demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)

train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})

xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))

# Write the raw data locally, then upload it.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)

    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,  # absolute paths
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )

train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
test_dataset = ParquetDataset(split_name="test", identifier_name="toy_dataset", path=directory["test_set_path"])
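In practice, your raw data often starts as a CSV file or a pandas DataFrame rather than an Arrow table. The following sketch, using plain pandas and pyarrow with a hypothetical file name, shows one way to add the required "index_xp_deep" column and produce the same kind of .parquet file:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical raw CSV; replace with your own file.
df = pd.read_csv("iris_raw.csv")

# Xpdeep requires an "index_xp_deep" column holding the row index.
df["index_xp_deep"] = range(len(df))

# Convert to an Arrow table and write it as Parquet.
pq.write_table(pa.Table.from_pandas(df, preserve_index=False), "train.parquet")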
Tip
Please follow the Getting Started guide to get your API key.
2. Upload your Converted Data
Once your data files have been converted to the correct format, you need to give Xpdeep access to them.
- If your data is stored locally, you need to upload it. You can also upload multiple files at once by passing additional parameters to the upload function.
from xpdeep.dataset.upload import upload

# Relative path
directory = upload(directory_name="my_uploaded_data", train_set_path="train.parquet")

# Absolute path
directory = upload(
    directory_name="my_uploaded_data",
    relative_paths=False,
    train_set_path="/Home/my_directory/train.parquet",
)

# Multiple files
directory = upload(
    directory_name="my_uploaded_data",
    train_set_path="train.parquet",
    test_set_path="test.parquet",
)
Full file preview
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import ParquetDataset
from xpdeep.dataset.upload import upload

demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)

train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})

xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))

# Write the raw data locally, then upload it.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)

    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,  # absolute paths
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )

train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
test_dataset = ParquetDataset(split_name="test", identifier_name="toy_dataset", path=directory["test_set_path"])
- If your data is stored with an Xpdeep-compatible cloud provider, you need to provide storage options at dataset instantiation; please refer to the next step below.
Most cloud providers will be supported: internally, Xpdeep datasets rely on fsspec to access the different buckets. Please refer again to the Datasets cloud storage documentation as a tutorial.
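Whether the file is local or already in a bucket, it can help to sanity check the Parquet schema and row count before wrapping it in a ParquetDataset. This sketch uses only pyarrow, and the file name is a placeholder:
import pyarrow.parquet as pq

# Inspect the Parquet footer without loading the data itself.
metadata = pq.read_metadata("train.parquet")
schema = pq.read_schema("train.parquet")

print(f"{metadata.num_rows} rows, {metadata.num_columns} columns")
print(schema)

# The required row-index column must be present.
assert "index_xp_deep" in schema.names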
3. Instantiate a Dataset
From your newly converted Arrow Table data, you can instantiate an Xpdeep explainable dataset. Provide either the uploaded path if you uploaded local data first, or storage options with the associated URL if your data lives in the cloud.
Instantiate from local files:
from xpdeep.dataset.parquet_dataset import ParquetDataset

# From local files previously uploaded to xpdeep cloud.
train_dataset = ParquetDataset(
    split_name="train",
    identifier_name="my_local_dataset",
    path=directory["train_set_path"],
)
Full file preview
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import ParquetDataset
from xpdeep.dataset.upload import upload

demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)

train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})

xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))

# Write the raw data locally, then upload it.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)

    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,  # absolute paths
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )

train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
test_dataset = ParquetDataset(split_name="test", identifier_name="toy_dataset", path=directory["test_set_path"])
Or from an S3 cloud URL with the associated credentials (only an example, not functional):
from xpdeep.dataset.parquet_dataset import ParquetDataset

my_cloud_provider_url = "s3://xpdeep-cloud-dataset/train.parquet"

storage_options = {
    "key": "KEY",
    "secret": "SECRET",
    "client_kwargs": {
        "region_name": "fr-par",
        "endpoint_url": "https://s3.fr-par.scw.cloud",
    },
}

train_dataset = ParquetDataset(
    split_name="train",
    identifier_name="my_cloud_dataset",
    path=my_cloud_provider_url,
    storage_options=storage_options,
)
Full file preview
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import ParquetDataset
from xpdeep.dataset.upload import upload

demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)

train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})

xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))

# Write the raw data locally, then upload it.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)

    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,  # absolute paths
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )

train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
test_dataset = ParquetDataset(split_name="test", identifier_name="toy_dataset", path=directory["test_set_path"])
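The Note above also mentions GCP storage. Assuming the same storage_options mechanism applies there, a GCS variant might look like the sketch below; the bucket, path, and credential values are placeholders, and the exact options accepted depend on the fsspec backend (gcsfs):
from xpdeep.dataset.parquet_dataset import ParquetDataset

# Hypothetical GCS location and service-account credentials.
gcs_url = "gs://my-bucket/datasets/train.parquet"
storage_options = {"token": "path/to/service_account.json"}

train_dataset = ParquetDataset(
    split_name="train",
    identifier_name="my_gcs_dataset",
    path=gcs_url,
    storage_options=storage_options,
)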
4. Create a Schema
This dataset does not contain a Schema object yet. As a schema is required to get any result from your data, you can either use this dataset to infer a schema automatically or build your own schema from scratch. Please check the next section to learn how to get a schema for your explainable dataset.