Create your Explainable Parquet Dataset
See the key concepts to instantiate a project before creating the dataset.
1. Convert your Raw Data
Currently, Xpdeep StandardDataset supports the .parquet format via the ParquetDataset class. Parquet files are backed by an Arrow Table, where each column represents a feature.
Note
Xpdeep currently supports cloud storage (S3, GCP) and local storage for your .parquet files. Internally, it relies on fsspec to transfer the data when it is stored in a cloud location.
You must then split your data into train, validation, and test sets. Each split is itself a ParquetDataset.
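For illustration only, here is a minimal pyarrow sketch of one way to write each split to its own .parquet file. The column names, split sizes, and file names are placeholders, and no Xpdeep API is involved yet.
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical raw table with 100 rows; replace with your own data.
table = pa.table({"feature": list(range(100)), "target": [i % 2 for i in range(100)]})

# A simple 70 / 15 / 15 split by row ranges (shuffle beforehand if needed).
splits = {
    "train": table.slice(0, 70),
    "validation": table.slice(70, 15),
    "test": table.slice(85, 15),
}

for split_name, split_table in splits.items():
    # Each split becomes its own .parquet file, later wrapped in a ParquetDataset.
    pq.write_table(split_table, f"{split_name}.parquet")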
The Arrow Format
As HuggingFace's Datasets package underlines, Arrow makes it possible to process and move large quantities of data quickly.
It is a specific data format that stores data in a columnar memory layout. This provides several significant advantages:
- Arrow's standard format allows zero-copy reads, which removes virtually all serialization overhead.
- Arrow is language-agnostic, so it supports different programming languages.
- Arrow is column-oriented, so it is faster at querying and processing slices or columns of data.
- Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow (illustrated in the short sketch after this list).
- Arrow supports many, possibly nested, column types.
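To make the last two points concrete, here is a tiny pyarrow-only sketch (toy values, no Xpdeep API involved) of column access and the hand-off to NumPy and Pandas:
import pyarrow as pa

table = pa.table({"petal_length": [1.4, 1.5, 1.3], "petal_width": [0.2, 0.2, 0.2]})

# Column-oriented layout: selecting a single column is cheap.
petal_length = table["petal_length"]

# Hand-off to NumPy and Pandas; for fixed-width numerical columns this can
# reuse the underlying Arrow buffers instead of copying them.
petal_length_np = petal_length.to_numpy()
df = table.to_pandas()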
Let's make an Arrow Table from a toy dataset of 8 samples, with two numerical features, "petal_length" and "petal_width", and a categorical target, "flower_type".
Here, we add an additional column, "index_xp_deep", required by Xpdeep; it is currently just the row index.
import pyarrow as pa
import pyarrow.parquet as pq

raw_data = pa.table(
    {
        'petal_length': [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
        'petal_width': [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
        'flower_type': ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
        'index_xp_deep': list(range(8)),
    }
)

# Write the table to a Parquet file
pq.write_table(raw_data, "train.parquet")
Full file preview
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import ParquetDataset
from xpdeep.dataset.upload import upload

demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)

train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})

xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))

# Write the raw data locally, then upload it.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)

    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,  # absolute paths
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )

train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
test_dataset = ParquetDataset(split_name="test", identifier_name="toy_dataset", path=directory["test_set_path"])
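In practice, your raw data often starts as a CSV file or a pandas DataFrame rather than an Arrow table. The following sketch, using plain pandas and pyarrow with a hypothetical file name, shows one way to add the required "index_xp_deep" column and produce the same kind of .parquet file:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical raw CSV; replace with your own file.
df = pd.read_csv("iris_raw.csv")

# Xpdeep requires an "index_xp_deep" column holding the row index.
df["index_xp_deep"] = range(len(df))

# Convert to an Arrow table and write it as Parquet.
pq.write_table(pa.Table.from_pandas(df, preserve_index=False), "train.parquet")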
Tip
Please follow the Getting Started guide to get your API key.
2. Upload your Converted Data
Once your data files have been converted to the correct format, you need to give Xpdeep access to them.
- If your data is stored locally, you need to upload it. You can also upload multiple files at once by passing additional parameters to the upload function.
from xpdeep.dataset.upload import upload

# Relative path
directory = upload(directory_name="my_uploaded_data", train_set_path="train.parquet")

# Absolute path
directory = upload(
    directory_name="my_uploaded_data",
    relative_paths=False,
    train_set_path="/Home/my_directory/train.parquet",
)

# Multiple files
directory = upload(
    directory_name="my_uploaded_data",
    train_set_path="train.parquet",
    test_set_path="test.parquet",
)
Full file preview
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import ParquetDataset
from xpdeep.dataset.upload import upload

demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)

train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})

xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))

# Write the raw data locally, then upload it.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)

    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,  # absolute paths
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )

train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
test_dataset = ParquetDataset(split_name="test", identifier_name="toy_dataset", path=directory["test_set_path"])
- If your data is stored with an Xpdeep-compatible cloud provider, you need to provide storage options at dataset instantiation; please refer to the next step below.
Most cloud providers will be supported: internally, Xpdeep datasets rely on fsspec to access the different buckets. Please refer again to the Datasets cloud storage documentation as a tutorial.
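Whether the file is local or already in a bucket, it can help to sanity check the Parquet schema and row count before wrapping it in a ParquetDataset. This sketch uses only pyarrow, and the file name is a placeholder:
import pyarrow.parquet as pq

# Inspect the Parquet footer without loading the data itself.
metadata = pq.read_metadata("train.parquet")
schema = pq.read_schema("train.parquet")

print(f"{metadata.num_rows} rows, {metadata.num_columns} columns")
print(schema)

# The required row-index column must be present.
assert "index_xp_deep" in schema.names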
3. Instantiate a Dataset
From your newly converted Arrow Table data, you can instantiate an Xpdeep explainable dataset. Provide either the uploaded path if you uploaded local data first, or storage options with the associated URL if your data lives in the cloud.
Instantiate from local files:
from xpdeep.dataset.parquet_dataset import ParquetDataset

# From local files previously uploaded to xpdeep cloud.
train_dataset = ParquetDataset(
    split_name="train",
    identifier_name="my_local_dataset",
    path=directory["train_set_path"],
)
Full file preview
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import ParquetDataset
from xpdeep.dataset.upload import upload

demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)

train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})

xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))

# Write the raw data locally, then upload it.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)

    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,  # absolute paths
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )

train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
test_dataset = ParquetDataset(split_name="test", identifier_name="toy_dataset", path=directory["test_set_path"])
Or from an S3 cloud URL with the associated credentials (only an example, not functional):
from xpdeep.dataset.parquet_dataset import ParquetDataset

my_cloud_provider_url = "s3://xpdeep-cloud-dataset/train.parquet"

storage_options = {
    "key": "KEY",
    "secret": "SECRET",
    "client_kwargs": {
        "region_name": "fr-par",
        "endpoint_url": "https://s3.fr-par.scw.cloud",
    },
}

train_dataset = ParquetDataset(
    split_name="train",
    identifier_name="my_cloud_dataset",
    path=my_cloud_provider_url,
    storage_options=storage_options,
)
Full file preview
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import xpdeep
from xpdeep import Project
from xpdeep.dataset.parquet_dataset import ParquetDataset
from xpdeep.dataset.upload import upload

demo = {"api_key": "your_api_key", "api_url": "your_api_url"}
xpdeep.init(**demo)

train_data = pa.table({
    "petal_length": [1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 6.0, 5.5],
    "petal_width": [0.2, 0.2, 0.2, 1.5, 1.3, 1.8, 2.5, 2.3],
    "flower_type": ["Setosa", "Setosa", "Setosa", "Versicolor", "Versicolor", "Versicolor", "Virginica", "Virginica"],
    "index_xp_deep": list(range(8)),
})
test_data = pa.table({
    "petal_length": [5.1, 1.2, 1.1, 6.1, 4.7, 1.6, 4.3, 5.8],
    "petal_width": [1.9, 0.2, 0.3, 2.2, 1.4, 0.4, 1.5, 2.0],
    "flower_type": ["Versicolor", "Setosa", "Setosa", "Virginica", "Versicolor", "Setosa", "Versicolor", "Virginica"],
    "index_xp_deep": list(range(8)),
})

xpdeep.set_project(Project("toy_dataset_project", name="toy dataset example", description="tutorial"))

# Write the raw data locally, then upload it.
with tempfile.TemporaryDirectory() as temp_dir:
    train_set_path = os.path.join(temp_dir, "train.parquet")
    test_set_path = os.path.join(temp_dir, "test.parquet")
    pq.write_table(train_data, train_set_path)
    pq.write_table(test_data, test_set_path)

    directory = upload(
        directory_name="my_uploaded_data",
        relative_paths=False,  # absolute paths
        train_set_path=train_set_path,
        test_set_path=test_set_path,
    )

train_dataset = ParquetDataset(split_name="train", identifier_name="toy_dataset", path=directory["train_set_path"])
test_dataset = ParquetDataset(split_name="test", identifier_name="toy_dataset", path=directory["test_set_path"])
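The Note above also mentions GCP storage. Assuming the same storage_options mechanism applies there, a GCS variant might look like the sketch below; the bucket, path, and credential values are placeholders, and the exact options accepted depend on the fsspec backend (gcsfs):
from xpdeep.dataset.parquet_dataset import ParquetDataset

# Hypothetical GCS location and service-account credentials.
gcs_url = "gs://my-bucket/datasets/train.parquet"
storage_options = {"token": "path/to/service_account.json"}

train_dataset = ParquetDataset(
    split_name="train",
    identifier_name="my_gcs_dataset",
    path=gcs_url,
    storage_options=storage_options,
)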
4. Create a Schema
This dataset does not contain a Schema object yet. As a schema is required to get any result from your data, you can either use this dataset to infer a schema automatically or build your own schema from scratch. Please check the next section to learn how to get a schema for your explainable dataset.