Machine Learning with Python

A Working Example

Author

JMG

import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
# Load the penguins data and keep only the columns used for modeling.
penguins = pd.read_csv("penguins.csv")
penguins = penguins[['sex', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'species']]
# The raw penguins dataset ships with rows whose sex / measurements are
# missing; drop them so the stratified split and the tree fit below do not
# fail on NaN values.
penguins = penguins.dropna()
# Handle categorical variables (one-hot encoding); drop_first keeps the
# dummies linearly independent (the first species becomes the baseline).
penguins = pd.get_dummies(penguins, columns=['species'], drop_first=True)
penguins.head()
sex bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species_Chinstrap species_Gentoo
0 male 39.1 18.7 181 3750 False False
1 female 39.5 17.4 186 3800 False False
2 female 40.3 18.0 195 3250 False False
3 female 36.7 19.3 193 3450 False False
4 male 39.3 20.6 190 3650 False False
# Split the data: 'sex' is the target, everything else is a feature.
X = penguins.drop('sex', axis=1)
y = penguins['sex']
# Stratify so both splits keep the same male/female ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, stratify=y)
# Define the decision tree classifier.
# Seed it so tie-breaking between equally good splits is reproducible
# (train_test_split above is already seeded; the tree was not).
tree_classifier = DecisionTreeClassifier(random_state=123)
# Define the parameter grid for tuning
param_grid = {
    'ccp_alpha': [0, 0.01, 0.1, 1],      # cost-complexity pruning strength
    'max_depth': [None, 5, 10, 15],      # None = grow until leaves are pure
    'min_samples_split': [2, 5, 10, 15]  # min samples required to split a node
}
# Perform grid search using 5-fold cross-validation. Both metrics are
# tracked; with multi-metric scoring, refit= names the one used to pick
# (and refit) the best estimator.
grid_search = GridSearchCV(tree_classifier, param_grid, cv=5,
                           scoring=['accuracy', 'roc_auc'], refit='roc_auc')
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'ccp_alpha': [0, 0.01, 0.1, 1],
                         'max_depth': [None, 5, 10, 15],
                         'min_samples_split': [2, 5, 10, 15]},
             refit='roc_auc', scoring=['accuracy', 'roc_auc'])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the hyper-parameter combination chosen by the grid search and the
# mean cross-validated ROC AUC it achieved.
best_params = grid_search.best_params_
best_cv_auc = grid_search.best_score_
print("Best parameters:", best_params)
print("Best ROC AUC:", best_cv_auc)
Best parameters: {'ccp_alpha': 0, 'max_depth': 5, 'min_samples_split': 15}
Best ROC AUC: 0.932576153846154
# Train the final model with the best parameters on the entire training set.
# NOTE: with refit='roc_auc', GridSearchCV has already refit best_estimator_
# on the full training data, so this explicit fit is a harmless no-op kept
# for clarity.
final_model = grid_search.best_estimator_
final_model = final_model.fit(X_train, y_train)
# Render the fitted tree. Passing the real feature and class names makes
# the diagram readable (the default labels nodes as X[0], X[1], ...).
dot_data = tree.export_graphviz(final_model, out_file=None,
                                feature_names=X.columns,
                                class_names=final_model.classes_,
                                filled=True, rounded=True,
                                special_characters=True)

graph = graphviz.Source(dot_data)
graph

# Score hard-label predictions on the held-out test set.
test_predictions = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Final Accuracy:", test_accuracy)
Final Accuracy: 0.8571428571428571
# Probability assigned to the positive class — column 1 of predict_proba
# corresponds to final_model.classes_[1].
positive_scores = final_model.predict_proba(X_test)[:, 1]

# Compute ROC AUC score on the held-out test set.
test_roc_auc = roc_auc_score(y_test, positive_scores)
print("Final ROC AUC:", test_roc_auc)
Final ROC AUC: 0.9430272108843537