import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz

# Machine Learning with Python
# A Working Example
# Load the penguins dataset from the current working directory.
penguins = pd.read_csv("penguins.csv")

# Keep only the target ('sex') and the columns used as predictors.
penguins = penguins[['sex', 'bill_length_mm', 'bill_depth_mm',
                     'flipper_length_mm', 'body_mass_g', 'species']]

# Handle categorical variables (one-hot encoding).
# drop_first=True omits the indicator for the first species level, leaving
# the species_Chinstrap / species_Gentoo columns seen in the preview below.
penguins = pd.get_dummies(penguins, columns=['species'], drop_first=True)

# Notebook-style preview; as a bare expression it shows nothing when this
# file is run as a plain script.
penguins.head()
# Recorded output:
# |   | sex    | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | species_Chinstrap | species_Gentoo |
# |---|--------|----------------|---------------|-------------------|-------------|-------------------|----------------|
# | 0 | male   | 39.1           | 18.7          | 181               | 3750        | False             | False          |
# | 1 | female | 39.5           | 17.4          | 186               | 3800        | False             | False          |
# | 2 | female | 40.3           | 18.0          | 195               | 3250        | False             | False          |
# | 3 | female | 36.7           | 19.3          | 193               | 3450        | False             | False          |
# | 4 | male   | 39.3           | 20.6          | 190               | 3650        | False             | False          |
# Split the data: 'sex' is the target, every other column is a predictor.
y = penguins['sex']
X = penguins.drop(columns='sex')

# Stratifying on y keeps the class proportions of 'sex' comparable in the
# train and test partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=123,
)

# Define the decision tree classifier
tree_classifier = DecisionTreeClassifier()

# Define the parameter grid for tuning
param_grid = dict(
    ccp_alpha=[0, 0.01, 0.1, 1],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10, 15],
)

# Perform grid search using cross-validation
# Evaluate every combination in param_grid with 5-fold cross-validation.
# Both accuracy and ROC AUC are scored for each candidate; refit='roc_auc'
# makes ROC AUC the metric used to pick (and refit) the best candidate.
grid_search = GridSearchCV(
    tree_classifier,
    param_grid,
    cv=5,
    scoring=['accuracy', 'roc_auc'],
    refit='roc_auc',
)
grid_search.fit(X_train, y_train)
# Recorded notebook output (repr of the fitted search object; the notebook's
# HTML-widget fallback repeated this text and is not reproduced here):
# GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
#              param_grid={'ccp_alpha': [0, 0.01, 0.1, 1],
#                          'max_depth': [None, 5, 10, 15],
#                          'min_samples_split': [2, 5, 10, 15]},
#              refit='roc_auc', scoring=['accuracy', 'roc_auc'])
# Display the best parameters and corresponding metrics
print("Best parameters:", grid_search.best_params_)
print("Best ROC AUC:", grid_search.best_score_)
# Recorded output:
# Best parameters: {'ccp_alpha': 0, 'max_depth': 5, 'min_samples_split': 15}
# Best ROC AUC: 0.932576153846154
# Train the final model with the best parameters on the entire training set
final_model = grid_search.best_estimator_
# NOTE(review): because the search was created with refit='roc_auc',
# best_estimator_ should already be fitted on X_train, which would make this
# call redundant. It is kept so the recorded results stay comparable --
# confirm against the GridSearchCV docs before removing it.
final_model = final_model.fit(X_train, y_train)

# Export the fitted tree to DOT text (out_file=None returns it as a string).
# Passing feature and class names labels the nodes with column names and
# target classes instead of positional X[i] / y[i] placeholders.
dot_data = tree.export_graphviz(
    final_model,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in final_model.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Bare expression: renders the tree inline in a notebook, does nothing when
# the file is run as a plain script.
graph

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Evaluate the final model
final_accuracy = accuracy_score(y_test, y_pred)
print("Final Accuracy:", final_accuracy)
# Recorded output:
# Final Accuracy: 0.8571428571428571

# Column 1 of predict_proba is used as the ranking score for ROC AUC
# (presumably the probability of the second entry in final_model.classes_ --
# verify against the estimator's class order).
y_pred_proba = final_model.predict_proba(X_test)[:, 1]
# Compute ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("Final ROC AUC:", roc_auc)
# Recorded output:
# Final ROC AUC: 0.9430272108843537