Machine Learning with Python

A Working Example

Author

JMG

import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
# Load the penguins data and keep only the columns used for modeling.
penguins = pd.read_csv("penguins.csv")
penguins = penguins[['sex', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'species']]
# The raw penguins dataset ships with rows whose sex / measurements are
# missing; drop them so the stratified split and the tree fit below do not
# fail on NaN values.
penguins = penguins.dropna()
# Handle categorical variables (one-hot encoding); drop_first keeps the
# dummies linearly independent (the first species becomes the baseline).
penguins = pd.get_dummies(penguins, columns=['species'], drop_first=True)
penguins.head()
sex bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species_Chinstrap species_Gentoo
0 male 39.1 18.7 181 3750 False False
1 female 39.5 17.4 186 3800 False False
2 female 40.3 18.0 195 3250 False False
3 female 36.7 19.3 193 3450 False False
4 male 39.3 20.6 190 3650 False False
# Split the data: 'sex' is the target, everything else is a feature.
X = penguins.drop('sex', axis=1)
y = penguins['sex']
# Stratify so both splits keep the same male/female ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, stratify=y)
# Define the decision tree classifier.
# Seed it so tie-breaking between equally good splits is reproducible
# (train_test_split above is already seeded; the tree was not).
tree_classifier = DecisionTreeClassifier(random_state=123)
# Define the parameter grid for tuning
param_grid = {
    'ccp_alpha': [0, 0.01, 0.1, 1],      # cost-complexity pruning strength
    'max_depth': [None, 5, 10, 15],      # None = grow until leaves are pure
    'min_samples_split': [2, 5, 10, 15]  # min samples required to split a node
}
# Perform grid search using 5-fold cross-validation. Both metrics are
# tracked; with multi-metric scoring, refit= names the one used to pick
# (and refit) the best estimator.
grid_search = GridSearchCV(tree_classifier, param_grid, cv=5,
                           scoring=['accuracy', 'roc_auc'], refit='roc_auc')
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'ccp_alpha': [0, 0.01, 0.1, 1],
                         'max_depth': [None, 5, 10, 15],
                         'min_samples_split': [2, 5, 10, 15]},
             refit='roc_auc', scoring=['accuracy', 'roc_auc'])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the hyper-parameter combination chosen by the grid search and the
# mean cross-validated ROC AUC it achieved.
best_params = grid_search.best_params_
best_cv_auc = grid_search.best_score_
print("Best parameters:", best_params)
print("Best ROC AUC:", best_cv_auc)
Best parameters: {'ccp_alpha': 0, 'max_depth': 5, 'min_samples_split': 15}
Best ROC AUC: 0.932576153846154
# Train the final model with the best parameters on the entire training set.
# NOTE: with refit='roc_auc', GridSearchCV has already refit best_estimator_
# on the full training data, so this explicit fit is a harmless no-op kept
# for clarity.
final_model = grid_search.best_estimator_
final_model = final_model.fit(X_train, y_train)
# Render the fitted tree. Passing the real feature and class names makes
# the diagram readable (the default labels nodes as X[0], X[1], ...).
dot_data = tree.export_graphviz(final_model, out_file=None,
                                feature_names=X.columns,
                                class_names=final_model.classes_,
                                filled=True, rounded=True,
                                special_characters=True)

graph = graphviz.Source(dot_data)
graph

# Score hard-label predictions on the held-out test set.
test_predictions = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Final Accuracy:", test_accuracy)
Final Accuracy: 0.8571428571428571
# Probability assigned to the positive class — column 1 of predict_proba
# corresponds to final_model.classes_[1].
positive_scores = final_model.predict_proba(X_test)[:, 1]

# Compute ROC AUC score on the held-out test set.
test_roc_auc = roc_auc_score(y_test, positive_scores)
print("Final ROC AUC:", test_roc_auc)
Final ROC AUC: 0.9430272108843537