Adult Income Dataset#
Adult Income is a dataset for binary classification from tabular inputs.
Please download the dataset here and update the tutorial data path accordingly.
The Adult Income dataset represents an individual's annual income along with various factors that intuitively influence it, such as education level, age, gender, and occupation. The target "income" is divided into two classes: "<=50K" and ">50K".
The following image summarizes the dataset.
Please follow this end-to-end tutorial to prepare the dataset, create and train the model, and finally compute explanations.
Prepare the Dataset#
1. Split and Convert your Raw Data#
The first step consists of creating your train, test, and validation splits as a StandardDataset.
import pandas as pd
# Load the CSV file
file_path = 'adult_income.csv'
data = pd.read_csv(file_path)
Here we drop two columns, "fnlwgt" and "education", as they won't help with the classification task:
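data = data.drop(columns=["fnlwgt", "education"])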
The dataset is then split into train, test, and validation sets.
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
In addition, we need to add an index_xp_deep column to each split; please see the doc.
train_data['index_xp_deep'] = range(len(train_data))
test_data['index_xp_deep'] = range(len(test_data))
val_data['index_xp_deep'] = range(len(val_data))
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="maximize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
As stated in the doc, Xpdeep requires a ".parquet" file to create the dataset. The original data is stored as a ".csv" file; therefore, each split must be converted to a ".parquet" file.
Tip
To get your ".parquet" files, you can easily convert each split from pandas.DataFrame to pyarrow.Table first.
Warning
Here we set preserve_index to False in order to remove the DataFrame index column from the resulting pyarrow Table.
import pyarrow as pa
import pyarrow.parquet as pq
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="maximize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
2. Upload your Converted Data#
Warning
Don't forget to set up a Project and initialize the API with your credentials!
from xpdeep import init, set_project
from xpdeep.project import Project
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId",name="Adult Income Tutorial"))
With your Project set up, you can upload the converted parquet files to the Xpdeep server.
from xpdeep.dataset.upload import upload
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
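The returned directory object is reused below to locate each uploaded split; for instance, the path recorded for the train split can be checked with:
print(directory["train_set_path"])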
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="maximize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
3. Instantiate a Dataset#
Here we instantiate a ParquetDataset for the train set only. We will create the validation and test datasets later.
from xpdeep.dataset.parquet_dataset import ParquetDataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="maximize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
4. Find a schema#
We use the AutoAnalyzer to get a schema proposal on the train set.
The only requirement is to specify the target name, here the "income" feature. It takes two values: "<=50K" and ">50K".
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
+--------------------------------------------------+
| Schema Contents |
+--------------------+-----------------+-----------+
| Type | Name | Is Target |
+--------------------+-----------------+-----------+
| NumericalFeature | age | ❌ |
| CategoricalFeature | workclass | ❌ |
| CategoricalFeature | educational-num | ❌ |
| CategoricalFeature | marital-status | ❌ |
| CategoricalFeature | occupation | ❌ |
| CategoricalFeature | relationship | ❌ |
| CategoricalFeature | race | ❌ |
| CategoricalFeature | gender | ❌ |
| NumericalFeature | capital-gain | ❌ |
| NumericalFeature | capital-loss | ❌ |
| NumericalFeature | hours-per-week | ❌ |
| CategoricalFeature | native-country | ❌ |
| CategoricalFeature | income | ✅ |
| Metadata | index_xp_deep | |
+--------------------+-----------------+-----------+
Note
Please note that the index_xp_deep column is automatically recognized and stored as Metadata in the Schema.
However, we would like the feature "educational-num" to be a NumericalFeature and not a CategoricalFeature, as its values carry a magnitude relationship: the higher the value, the more educated the individual. We can force the feature type:
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from sklearn.preprocessing import StandardScaler
from xpdeep.dataset.schema.feature.feature import NumericalFeature
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(name="educational-num",
is_target=False,
preprocessor=preprocessor)
print(analyzed_train_dataset.analyzed_schema)
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="maximize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
5. Fit the schema#
With your Schema analyzed on the train set, you can now fit the schema, which fits each feature preprocessor on the train set:
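fit_train_dataset = analyzed_train_dataset.fit()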
We then use the same FittedSchema to create a FittedParquetDataset for the validation and test sets.
from xpdeep.dataset.parquet_dataset import FittedParquetDataset
fit_test_dataset = FittedParquetDataset(split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema)
fit_val_dataset = FittedParquetDataset(split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema)
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="maximize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
And that's all for the dataset preparation. We now have three FittedParquetDataset, each with its FittedSchema, ready to be used.
Prepare the Model#
We now need to create an explainable model, XpdeepModel.
1. Create the required torch models#
We have a binary classification task with tabular input data. We will use a basic Multi Layer Perceptron (MLP) for this task.
Tip
Model input and output sizes (including the batch dimension) can be easily retrieved from the fitted schema.
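For instance:
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")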
Therefore:
- The FeatureExtractionModel will embed the input data into a 50-dimensional space.
- The TaskLearnerModel will use a Softmax output layer for the output, which is of size 2.
- No BackboneModel is required for this simple task.
Note
Here we use an output size of 2, as the AutoAnalyzer's default behaviour is to associate a OneHotEncoder to a categorical target, even with only two different values.
Warning
Currently, xpdeep does not support label encoding for targets; classes must therefore be one-hot encoded. As a consequence, a Sigmoid activation cannot be used, even for binary classification.
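As an illustration only (not part of the tutorial pipeline), here is a minimal sketch of what one-hot encoding the two income classes looks like, using scikit-learn's OneHotEncoder:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Two classes become two one-hot columns, matching the task learner output size of 2.
encoder = OneHotEncoder()
targets = np.array([["<=50K"], [">50K"], ["<=50K"]])
print(encoder.fit_transform(targets).toarray())
# [[1. 0.]
#  [0. 1.]
#  [1. 0.]]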
from functools import partial
import torch
from xpdeep.model.zoo.mlp import MLP
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="maximize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
2. Explainable Model Specifications#
Here comes the crucial part: we need to specify the model specifications under ModelDecisionGraphParameters to get the best explanations (Model Decision Graph and Inference Graph).
from xpdeep.model.model_builder import ModelDecisionGraphParameters
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="minimize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
Let's break down the configuration:
graph_depth: 3 is usually a suitable value for classification tasks, as it can gather up to 2**3 = 8 different groups of samples.
target_homogeneity_pruning_threshold: 0.8
population_pruning_threshold: 0.15
prune_step: 5, meaning a pruning step will occur every 5 epochs during the training stage.
target_homogeneity_weight: 1.0
discrimination_weight: 0.1
balancing_weight: 0.05
For further details, see the docs.
Note
All parameters have a default value. You can start with the default values, then iterate and update the configuration to find suitable explanations.
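For instance, starting from the defaults and overriding a single parameter is a minimal sketch of that iteration loop (the graph_depth=4 value below is purely illustrative):
from xpdeep.model.model_builder import ModelDecisionGraphParameters
# Start from the library defaults: every parameter has a default value.
default_specifications = ModelDecisionGraphParameters()
# Override only what you want to explore, e.g. a deeper decision graph
# (a depth of 4 can gather up to 2**4 = 16 groups of samples).
deeper_specifications = ModelDecisionGraphParameters(graph_depth=4)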
3. Create the Explainable Model#
Given the model architecture and configuration, we can finally instantiate the explainable model XpdeepModel.
from xpdeep.model.xpdeep_model import XpdeepModel
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="minimize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
Train#
The training step is straightforward: we need to specify the Trainer parameters.
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.trainer.callbacks import EarlyStopping, Scheduler, ModelCheckpoint
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.trainer.trainer import Trainer
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="minimize"),
]
# The optimizer is a partial object because PyTorch requires the model parameters when the optimizer is instantiated.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="minimize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
Note
Here, the loss is a custom loss compatible with our output format, based on the default torch cross-entropy loss.
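For intuition only, here is a conceptual sketch of a cross-entropy computed from probabilities (the task learner ends with a Softmax); this is not the xpdeep CrossEntropyLossFromProbabilities implementation, and the function name is hypothetical:
import torch

def cross_entropy_from_probabilities(probabilities, one_hot_targets):
    # Per-sample cross-entropy, analogous to reduction="none": one loss value per sample.
    return -(one_hot_targets * torch.log(probabilities.clamp_min(1e-12))).sum(dim=1)

probabilities = torch.tensor([[0.9, 0.1], [0.3, 0.7]])
one_hot_targets = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
print(cross_entropy_from_probabilities(probabilities, one_hot_targets))  # tensor([0.1054, 0.3567])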
Note
For multiclass metrics, torch metrics expect targets to be a vector of class indices rather than one-hot vectors. As our targets are one-hot vectors, we use custom metric classes that convert them to indices.
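As an illustration (a sketch only, not the xpdeep metric classes themselves), one-hot targets can be converted to class indices with argmax before calling a plain torchmetrics metric:
import torch
from torchmetrics.classification import MulticlassAccuracy

one_hot_targets = torch.tensor([[1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
predicted_probabilities = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])

index_targets = one_hot_targets.argmax(dim=1)  # tensor([0, 1, 1])
accuracy = MulticlassAccuracy(num_classes=2)
print(accuracy(predicted_probabilities, index_targets))  # the metric now receives indices, as expected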
We can now train the model:
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="minimize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
The training logs are displayed in the console:
Epoch 1/20 - Loss: 0.657: 1%|▏ | 3/229 [00:00<00:30, 7.44it/s]
Epoch 1/20 - Loss: 0.614: 3%|▎ | 7/229 [00:00<00:25, 8.56it/s]
Epoch 1/20 - Loss: 0.539: 4%|▍ | 10/229 [00:01<00:25, 8.58it/s]
Epoch 1/20 - Loss: 0.491: 6%|▌ | 14/229 [00:01<00:26, 8.22it/s]
Once the model is trained, it can be used to compute explanations.
Explain#
Similarly to the Trainer, explanations are computed with an Explainer interface.
1. Build the Explainer#
We provide the Explainer with quality metrics to get insights into the explanation quality. In addition, we compute the distributions of targets and predictions along with the explanations. Finally, we set description_representativeness to 1000.
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="minimize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
Tip
Here we reuse the metrics from the training stage for convenience, but they can be adapted to your needs!
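For example, a metrics dictionary dedicated to the explanation stage could look like the sketch below; it reuses the classes already imported above, and the explain_metrics name is illustrative:
explain_metrics = DictMetrics(
    multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
    confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
explainer = Explainer(
    description_representativeness=1000,
    quality_metrics=quality_metrics,
    metrics=explain_metrics,
    statistics=statistics,
)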
2. Model Functioning Explanations#
Model Functioning Explanations are computed with the global_explain method.
model_explanations = explainer.global_explain(trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="minimize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
We can visualize the explanations with XpViz, using the link in model_explanations.visualisation_link, provided you have already requested the correct credentials.
3. Inference and their Causal Explanations#
We need a subset of samples on which to compute Causal Explanations. Here we filter the test set on two features, selecting samples with "age" under 30 and "workclass" equal to "Private". This subset contains 2461 samples.
from xpdeep.filtering.filter import Filter
from xpdeep.filtering.criteria import NumericalCriterion, CategoricalCriterion
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
Explanations can then be computed using the local_explain method of the Explainer.
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
👀 Full file preview
"""Adult Income workflow, classification, tabular data."""
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from xpdeep import init, set_project
from xpdeep.dataset.parquet_dataset import FittedParquetDataset, ParquetDataset
from xpdeep.dataset.schema.feature.feature import NumericalFeature
from xpdeep.dataset.schema.preprocessor import SklearnPreprocessor
from xpdeep.dataset.upload import upload
from xpdeep.explain.explainer import Explainer
from xpdeep.explain.quality_metrics import Infidelity, Sensitivity
from xpdeep.explain.statistic import DictStats, DistributionStat
from xpdeep.filtering.criteria import CategoricalCriterion, NumericalCriterion
from xpdeep.filtering.filter import Filter
from xpdeep.metrics.metric import DictMetrics
from xpdeep.metrics.zoo.multiclass_metrics import MulticlassConfusionMatrix, MulticlassAccuracy, MulticlassF1Score
from xpdeep.model.model_builder import ModelDecisionGraphParameters
from xpdeep.model.xpdeep_model import XpdeepModel
from xpdeep.model.zoo.cross_entropy_loss_from_proba import CrossEntropyLossFromProbabilities
from xpdeep.model.zoo.mlp import MLP
from xpdeep.project import Project
from xpdeep.trainer.callbacks import EarlyStopping, ModelCheckpoint, Scheduler
from xpdeep.trainer.trainer import Trainer
torch.random.manual_seed(5)
# ##### Prepare the Dataset #######
# 1. Split and Convert your Raw Data
# Load the CSV file
file_path = "adult_income.csv"
data = pd.read_csv(file_path)
data = data.drop(columns=["fnlwgt", "education"])
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Further split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
train_data["index_xp_deep"] = range(len(train_data))
test_data["index_xp_deep"] = range(len(test_data))
val_data["index_xp_deep"] = range(len(val_data))
# Convert to pyarrow Table format
train_table = pa.Table.from_pandas(train_data, preserve_index=False)
val_table = pa.Table.from_pandas(val_data, preserve_index=False)
test_table = pa.Table.from_pandas(test_data, preserve_index=False)
# Save each split as ".parquet" file
pq.write_table(train_table, "train.parquet")
pq.write_table(val_table, "val.parquet")
pq.write_table(test_table, "test.parquet")
# 2. Upload your Converted Data
init(api_key="api_key", api_url="api_url")
set_project(Project(id="AdultIncomeId", name="Adult Income Tutorial"))
directory = upload(
directory_name="adult_income_uploaded",
train_set_path="train.parquet",
test_set_path="test.parquet",
val_set_path="val.parquet",
)
# 3. Instantiate a Dataset
train_dataset = ParquetDataset(
split_name="train",
identifier_name="my_local_dataset",
path=directory["train_set_path"],
)
# 4. Find a schema
analyzed_train_dataset = train_dataset.analyze(target_names=["income"])
print(analyzed_train_dataset.analyzed_schema)
preprocessor = SklearnPreprocessor(preprocess_function=StandardScaler())
analyzed_train_dataset.analyzed_schema["educational-num"] = NumericalFeature(
name="educational-num", is_target=False, preprocessor=preprocessor
)
print(analyzed_train_dataset.analyzed_schema)
# 5. Fit the schema
fit_train_dataset = analyzed_train_dataset.fit()
fit_test_dataset = FittedParquetDataset(
split_name="test",
identifier_name="my_local_dataset",
path=directory["test_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
fit_val_dataset = FittedParquetDataset(
split_name="val",
identifier_name="my_local_dataset",
path=directory["val_set_path"],
fitted_schema=fit_train_dataset.fitted_schema,
)
# ##### Prepare the Model #######
# 1. Create the required torch models
input_size = fit_train_dataset.fitted_schema.input_size[1]
target_size = fit_train_dataset.fitted_schema.target_size[1]
print(f"input_size: {input_size} - target_size: {target_size}")
feature_extraction = MLP(input_size=input_size, hidden_channels=[128, 50])
task_learner = MLP(input_size=50, hidden_channels=[target_size], last_activation=partial(torch.nn.Softmax, dim=1))
# 2. Explainable Model Specifications
model_specifications = ModelDecisionGraphParameters(
graph_depth=3,
target_homogeneity_pruning_threshold=0.8,
population_pruning_threshold=0.15,
prune_step=5,
target_homogeneity_weight=1.0,
discrimination_weight=0.1,
balancing_weight=0.05,
)
# 3. Create the Explainable Model
xpdeep_model = XpdeepModel.from_torch(
fitted_schema=fit_train_dataset.fitted_schema,
feature_extraction=feature_extraction,
task_learner=task_learner,
backbone=None,
decision_graph_parameters=model_specifications,
)
# ##### Train #######
# Metrics to monitor the training.
metrics = DictMetrics(
multi_class_accuracy=partial(MulticlassAccuracy, num_classes=2),
multi_class_F1_score=partial(MulticlassF1Score, num_classes=2),
confusion_matrix=partial(MulticlassConfusionMatrix, normalize="all", num_classes=2),
)
callbacks = [
EarlyStopping(monitoring_metric="Total loss", mode="minimize", patience=5),
Scheduler(pre_scheduler=partial(ReduceLROnPlateau), step_method="epoch", monitoring_metric="Total loss"),
ModelCheckpoint(monitoring_metric="global_multi_class_F1_score", mode="minimize"),
]
# Optimizer is a partial object as pytorch needs to give the model as optimizer parameter.
optimizer = partial(torch.optim.AdamW, lr=0.001)
trainer = Trainer(
loss=CrossEntropyLossFromProbabilities(reduction="none"),
optimizer=optimizer,
callbacks=callbacks,
start_epoch=0,
max_epochs=20,
metrics=metrics,
)
trained_model = trainer.train(
model=xpdeep_model,
train_set=fit_train_dataset,
validation_set=fit_val_dataset,
batch_size=128,
)
# ##### Explain #######
# 1. Build the Explainer
statistics = DictStats(
distribution_target=DistributionStat(on="target"), distribution_prediction=DistributionStat(on="prediction")
)
quality_metrics = [Sensitivity(), Infidelity()]
explainer = Explainer(
description_representativeness=1000, quality_metrics=quality_metrics, metrics=metrics, statistics=statistics
)
# 2. Model Functioning Explanations
model_explanations = explainer.global_explain(
trained_model,
train_set=fit_train_dataset,
test_set=fit_test_dataset,
validation_set=fit_val_dataset,
)
visualisation_link = model_explanations.visualisation_link
# 3. Inference and their Causal Explanations
my_filter = Filter("testing_filter", fit_test_dataset)
my_filter.add_criteria(
NumericalCriterion(fit_test_dataset.fitted_schema["age"], max_=30),
CategoricalCriterion(fit_test_dataset.fitted_schema["workclass"], categories=["Private"]),
)
causal_explanations = explainer.local_explain(trained_model, fit_test_dataset, my_filter)
visualisation_link = causal_explanations.visualisation_link
We can again visualize the causal explanations using the visualisation_link.