
From a PyTorch model to a deep explainable model#

As a quick introduction to the Xpdeep APIs, this section shows, on the HAR dataset, how to adapt the PyTorch code of a standard deep model in order to design an explainable deep model instead.

We will walk through the key steps of building a deep model: architecture specification, training, and, for the Xpdeep model, explanation generation.

For each step in building a deep model, we provide:

  • Tabs labeled "SOTA and Xpdeep" for code that is identical for both the SOTA deep model and the Xpdeep explainable model.

  • Tabs labeled "Xpdeep" for code specific to the Xpdeep explainable model.

1. Project Setup#

Set Up API Key and URL#

from xpdeep import init

init(api_key="MY_API_KEY", api_url="MY_API_URL")

Create a Project#

from xpdeep import set_project
from xpdeep.project import Project

set_project(Project.create_or_get(name="Har Tutorial"))

2. Data Preparation#

Read Raw Data#

from pathlib import Path
import numpy as np
import pandas as pd

# Read train data
features_dict = {}

split_name = "train"

for feature_filepath in sorted(Path(f"{split_name}/Inertial Signals/").rglob("*.txt")):
    feature_name = feature_filepath.stem
    features_dict[feature_name] = np.squeeze(
        pd.read_csv(feature_filepath, sep=r"\s+", header=None).to_numpy(dtype=np.float32)
    )

train_inputs = np.transpose(np.stack(list(features_dict.values()), axis=1), (0, 2, 1))
train_targets = np.squeeze(
    pd.read_csv(f"{split_name}/y_{split_name}.txt", sep=r"\s+", header=None).to_numpy(dtype=np.float32)
)


# Read test data
features_dict = {}

split_name = "test"

for feature_filepath in sorted(Path(f"{split_name}/Inertial Signals/").rglob("*.txt")):
    feature_name = feature_filepath.stem
    features_dict[feature_name] = np.squeeze(
        pd.read_csv(feature_filepath, sep=r"\s+", header=None).to_numpy(dtype=np.float32)
    )

test_inputs = np.transpose(np.stack(list(features_dict.values()), axis=1), (0, 2, 1))
test_targets = np.squeeze(
    pd.read_csv(f"{split_name}/y_{split_name}.txt", sep=r"\s+", header=None).to_numpy(dtype=np.float32)
)

# Map the targets to their labels
activity_mapping = {
    1: "Walking",
    2: "Walking upstairs",
    3: "Walking downstairs",
    4: "Sitting",
    5: "Standing",
    6: "Laying",
}

targets_mapper = np.vectorize(lambda x: activity_mapping[x])
train_targets = targets_mapper(train_targets)  # Map targets to their labels.

test_targets = targets_mapper(test_targets) 
test_val_data = pd.DataFrame.from_dict({"human_activity": test_inputs.tolist(), "activity": test_targets})
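
At this point each split is a 3D array of shape (samples, timesteps, channels). An optional sanity check (the sample counts below assume the standard UCI HAR split of 7352 training and 2947 test windows):

print(train_inputs.shape)        # (7352, 128, 9): 128 timesteps, 9 inertial signal channels
print(test_inputs.shape)         # (2947, 128, 9)
print(np.unique(train_targets))  # the six activity labels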

Split Data#

import numpy as np
from sklearn.model_selection import train_test_split

train_data = pd.DataFrame.from_dict({"human_activity": train_inputs.tolist(), "activity": train_targets})
test_data, val_data = train_test_split(test_val_data, test_size=0.5, random_state=42)

print(f"Input shape : {np.array(train_data["human_activity"].to_list()).shape}")

Conversion to Parquet Format#

import pyarrow as pa
import pyarrow.parquet as pq

# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)

# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
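
Optionally, confirm that each parquet file round-trips with the two expected columns (a minimal check using pyarrow only):

print(pq.read_table("train.parquet").schema.names)  # ['human_activity', 'activity']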

Upload#

from xpdeep.dataset.upload import upload

directory = upload(
    directory_name="har_uploaded",
    train_set_path="train.parquet",
    test_set_path="test.parquet",
    val_set_path="val.parquet",
)
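
The returned directory maps each split to its uploaded location; those paths are reused below when declaring the datasets, for example directory["train_set_path"]. As an illustrative check only (the exact structure may vary):

print(directory)  # uploaded paths, e.g. directory["train_set_path"], directory["test_set_path"], ...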

Preprocess Data#

from sklearn.preprocessing import OneHotEncoder, StandardScaler

input_standard_scaler_for_nn = StandardScaler().fit(np.array(train_data["human_activity"].to_list()).reshape(-1, 1))
target_one_hot_encoder_for_nn = OneHotEncoder(sparse_output=False).fit(train_data[["activity"]].values)

x_train = np.array(train_data["human_activity"].to_list())
x_train_shape_d1, x_train_shape_d2, x_train_shape_d3 = x_train.shape
x_train = input_standard_scaler_for_nn.transform(x_train.reshape(-1,1)).reshape(x_train_shape_d1, x_train_shape_d2, x_train_shape_d3)
y_train = target_one_hot_encoder_for_nn.transform(train_data["activity"].to_numpy().reshape(-1,1))

x_val = np.array(val_data["human_activity"].to_list())
x_val_shape_d1, x_val_shape_d2, x_val_shape_d3 = x_val.shape
x_val = input_standard_scaler_for_nn.transform(x_val.reshape(-1,1)).reshape(x_val_shape_d1, x_val_shape_d2, x_val_shape_d3)
y_val = target_one_hot_encoder_for_nn.transform(val_data["activity"].to_numpy().reshape(-1,1))

x_test = np.array(test_data["human_activity"].to_list())
x_test_shape_d1, x_test_shape_d2, x_test_shape_d3 = x_test.shape
x_test = input_standard_scaler_for_nn.transform(x_test.reshape(-1,1)).reshape(x_test_shape_d1, x_test_shape_d2, x_test_shape_d3)
y_test = target_one_hot_encoder_for_nn.transform(test_data["activity"].to_numpy().reshape(-1,1))
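
Because the scaler was fitted on the flattened training signal, the standardized training inputs should have roughly zero mean and unit variance (an optional check; the exact sample counts depend on your split):

print(x_train.shape, y_train.shape)  # (n_train, 128, 9) inputs, (n_train, 6) one-hot targets
print(round(float(x_train.mean()), 3), round(float(x_train.std()), 3))  # approximately 0.0 and 1.0
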
from xpdeep.dataset.schema.feature.feature import (
    CategoricalFeature,
    MultivariateTimeSeries,
)
from xpdeep.dataset.schema.schema import AnalyzedSchema
from xpdeep.dataset.schema.preprocessor import TorchPreprocessor, SklearnPreprocessor
from sklearn.preprocessing import OneHotEncoder
import torch
from xpdeep.dataset.parquet_dataset import AnalyzedParquetDataset, FittedParquetDataset

# 1/ Create Analyzed Schema

class ScaleHAR(TorchPreprocessor):
    def __init__(self, input_size: tuple[int, ...]):
        """Initialize the scaler."""
        super().__init__(input_size=input_size)
        self.mean = torch.nn.Parameter(
            torch.tensor(train_table.column("human_activity").to_pylist()).mean(dim=(0,1))
        )
        self.std = torch.nn.Parameter(
            torch.tensor(train_table.column("human_activity").to_pylist()).std(dim=(0,1))
        )

    def transform(self, inputs: torch.Tensor) -> torch.Tensor:
        """Transform."""
        return (inputs - self.mean) / self.std

    def inverse_transform(self, output: torch.Tensor) -> torch.Tensor:
        """Apply inverse transform."""
        return output * self.std + self.mean

analyzed_schema = AnalyzedSchema(
    MultivariateTimeSeries(
        asynchronous=True,
        channel_names=[
            "body_acc_x",
            "body_acc_y",
            "body_acc_z",
            "body_gyro_x",
            "body_gyro_y",
            "body_gyro_z",
            "total_acc_x",
            "total_acc_y",
            "total_acc_z",
        ],
        name="human_activity",
        preprocessor=ScaleHAR(input_size=(128, 9)),
    ),
    CategoricalFeature(
        is_target=True,
        name="activity",
        preprocessor=SklearnPreprocessor(preprocess_function=OneHotEncoder(sparse_output=False)),
    ),
)

# 2/ Create Analyzed Parquet on Train Dataset

analyzed_train_dataset = AnalyzedParquetDataset(
    split_name="train",
    identifier_name="my_local_dataset",
    path=directory["train_set_path"],
    analyzed_schema=analyzed_schema,
)

print(analyzed_schema)

# 3/ Create Fitted Parquet Datasets

fit_train_dataset = analyzed_train_dataset.fit()

fit_test_dataset = FittedParquetDataset(
    split_name="test",
    identifier_name="my_local_dataset",
    path=directory["test_set_path"],
    fitted_schema=fit_train_dataset.fitted_schema,
)

fit_val_dataset = FittedParquetDataset(
    split_name="validation",
    identifier_name="my_local_dataset",
    path=directory["val_set_path"],
    fitted_schema=fit_train_dataset.fitted_schema,
)
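
Before moving on to the model, you can inspect the fitted schema; its target size is read the same way in the training section below to configure the metrics:

print(fit_train_dataset.fitted_schema)
print(fit_train_dataset.fitted_schema.target_size)  # the one-hot encoded "activity" target spans 6 classes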

3. Model Construction#

Architecture Specification#

from torch.nn import Sequential
import torch

device = "cpu"

class SotaModel(Sequential):
    def __init__(self):

        layers = [
            torch.nn.Conv1d(9, 32, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),

            torch.nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),

            torch.nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Flatten(),

            torch.nn.LazyLinear(out_features=6),
        ]

        super().__init__(*layers)


    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        # Conv1d expects (batch, channels, time), while samples are stored as (batch, time, channels).
        x = inputs.transpose(1, 2)
        return super().forward(x)

import torch
from torch.nn import Sequential


class FeatureExtractor(Sequential):
    def __init__(self):

        layers = [
            torch.nn.Conv1d(9, 32, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),

            torch.nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),

            torch.nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Flatten(),
        ]

        super().__init__(*layers)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        x = inputs.transpose(1, 2)
        return super().forward(x)


class TaskLearner(Sequential):
    def __init__(self):

        layers = [
            torch.nn.LazyLinear(out_features=6),
            torch.nn.Softmax(dim=-1)
        ]

        super().__init__(*layers)
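
Before wiring the two parts into an Xpdeep model, a quick local check is to push a dummy batch through them: with HAR windows of 128 timesteps and 9 channels, the feature extractor yields a flattened embedding and the task learner yields 6 class probabilities (an optional sketch, not part of the Xpdeep API):

import torch

feature_extractor = FeatureExtractor()
task_learner = TaskLearner()

dummy_batch = torch.randn(4, 128, 9)         # (batch, timesteps, channels), as stored in the schema
embeddings = feature_extractor(dummy_batch)  # (4, 16384) after the Conv1d stack and Flatten
probabilities = task_learner(embeddings)     # (4, 6) class probabilities from the Softmax head

print(embeddings.shape, probabilities.shape)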

Model Instantiation#

sota_model = SotaModel()

from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel

# Explanation Architecture
explanation_architecture = ModelDecisionGraphParameters(
    graph_depth=3,
    discrimination_weight=0.1,
    target_homogeneity_weight=2.0,
    prune_step=11,
    target_homogeneity_pruning_threshold=0.7,
    population_pruning_threshold=0.05,
    balancing_weight=1.0,
)

# XPDEEP Model Architecture
xpdeep_model = XpdeepModel.from_torch(
    fitted_schema=fit_train_dataset.fitted_schema,
    feature_extraction=FeatureExtractor(),
    task_learner=TaskLearner(),
    decision_graph_parameters=explanation_architecture,
)

4. Training#

Training Specification#

from torch import nn

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(sota_model.parameters(), lr=1e-3)
batch_size = 128
epochs = 20

from xpdeep.trainer.callbacks import EarlyStopping, Scheduler
from functools import partial
from xpdeep.metric import DictMetrics, TorchGlobalMetric, TorchLeafMetric
from torchmetrics.classification import MulticlassAccuracy, MulticlassConfusionMatrix
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep.trainer.trainer import Trainer
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities


target_size = fit_train_dataset.fitted_schema.target_size[1]

# Explanation Metrics
metrics = DictMetrics(
    global_multi_class_accuracy=TorchGlobalMetric(
        partial(MulticlassAccuracy, num_classes=target_size, average="micro"), target_as_indexes=True
    ),
    leaf_multi_class_accuracy=TorchLeafMetric(
        partial(MulticlassAccuracy, num_classes=target_size, average="micro"), target_as_indexes=True
    ),
    leaf_confusion_matrix=TorchLeafMetric(
        partial(MulticlassConfusionMatrix, num_classes=target_size, normalize="all"), target_as_indexes=True
    ),

)

callbacks = [
    EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=10),
    Scheduler(
        pre_scheduler=partial(ReduceLROnPlateau, patience=3, mode="max"),
        step_method="epoch",
        monitoring_metric="global_multi_class_accuracy",
    ),
]

# XPDEEP Training Specifications
trainer = Trainer(
    loss=CrossEntropyLossFromProbabilities(reduction="none"),
    optimizer=partial(torch.optim.AdamW, lr=0.001, foreach=False, fused=False),
    start_epoch=0,
    max_epochs=60,
    metrics=metrics,
    callbacks=callbacks,
)
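
Note that the SOTA model is trained on logits with nn.CrossEntropyLoss, whereas the Xpdeep TaskLearner ends with a Softmax layer; the Xpdeep trainer therefore uses CrossEntropyLossFromProbabilities, which operates on class probabilities rather than logits.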

Model Training#

import torch

def train(X_train, y_train, model, loss_fn, optimizer):
    size = len(X_train)
    model.train()
    total_loss = 0

    for batch in range(size//batch_size):

        X_batch = torch.tensor(X_train[batch * batch_size:(batch + 1) * batch_size], dtype=torch.float32).to(device)
        y_batch = torch.tensor(y_train[batch * batch_size:(batch + 1) * batch_size], dtype=torch.float32).to(device)

        # Compute prediction error
        pred = model(X_batch)
        loss = loss_fn(pred, y_batch)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss/(size//batch_size)
    return average_loss


def eval_(X_test, y_test, model, loss_fn):
    # size = len(X_test)
    model.eval()
    with torch.no_grad():
        X_test, y_test = torch.tensor(X_test, dtype=torch.float32).to(device), torch.tensor(y_test, dtype=torch.float32).to(device)

        pred = model(X_test)
        test_loss = loss_fn(pred, y_test).item()

        accuracy = (torch.argmax(pred, 1) == torch.argmax(y_test, 1)).float().mean()

        return nn.Softmax(dim=-1)(pred).round(), test_loss, accuracy


for t in range(epochs):

    print(f"\nEpoch {t+1}\n-------------------------------")


    training_loss = train(
        x_train, 
        y_train, 
        sota_model, 
        loss_fn, 
        optimizer
    )

    _, val_loss, _ = eval_(
        x_val, 
        y_val, 
        sota_model, 
        loss_fn
    )

    print(f"Training Loss: {training_loss}\nValidation Loss: {val_loss}")

_, _, accuracy_on_train = eval_(x_train, y_train, sota_model, loss_fn)
_, _, accuracy_on_validation = eval_(x_val, y_val, sota_model, loss_fn)
_, _, accuracy_on_test = eval_(x_test, y_test, sota_model, loss_fn)

print(f"\nAccuracies: "
      f"\nAccuracy on train set      : {accuracy_on_train}"
      f"\nAccuracy on validation set : {accuracy_on_validation}"
      f"\nAccuracy on test set       : {accuracy_on_test}"
)

trained_model = trainer.train(
    model=xpdeep_model,
    train_set=fit_train_dataset,
    validation_set=fit_val_dataset,
    batch_size=128,
)

5. Explanation Generation#

from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat

statistics = DictStats(
    distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)

quality_metrics = [Sensitivity(), Infidelity()]

explainer = Explainer(
    description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)

model_explanations = explainer.global_explain(
    trained_model,
    train_set=fit_train_dataset,
    test_set=fit_test_dataset,
    validation_set=fit_val_dataset,
)

print(model_explanations.visualisation_link)