# Load Data
# Read the model-ready churn dataset and split it into features (X) and
# target (y). "churn" is the binary label column to be predicted.
import pandas as pd

df = pd.read_csv("data/ml-ready/cdi-customer-churn.csv")
X = df.drop(columns="churn")
y = df["churn"]
# (Notebook section headings; "Load Data" describes the cell above.)
# Load Data
# Train/Test Split
# Train/Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of rows for testing. stratify=y keeps the churn/no-churn
# class balance identical in both splits; random_state=42 makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Define Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Route columns by dtype: numeric columns are standardized, object
# (string) columns are one-hot encoded.
# NOTE(review): "customer_id" is an object column, so it will be one-hot
# encoded as a feature — it looks like a row identifier that should be
# dropped before modeling; confirm upstream.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

# handle_unknown="ignore" prevents predict() from raising on category
# values that appear only in the test set (unseen during fit).
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
# Build Logistic Regression Pipeline
from sklearn.linear_model import LogisticRegression

# Chain preprocessing and the classifier so fit/predict apply both steps
# in one call and the preprocessing is learned only from training data.
# max_iter is raised from the default 100 to give the solver room to
# converge on the scaled/encoded feature matrix.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)
# Train Classifier
# Fit the full pipeline: the preprocessor learns scaling/encoding
# statistics from X_train, then logistic regression is fit on the
# transformed training features.
clf.fit(X_train, y_train)
# Predict Classes
# Hard class labels at the default decision threshold.
y_pred = clf.predict(X_test)
# Peek at the first few predictions (presumably 0 = retained,
# 1 = churned — confirm against the label encoding upstream).
y_pred[:10]
# Evaluate Classification Performance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Binary-classification metrics computed on the 0.5-threshold labels.
# precision/recall/f1 score the positive class (pos_label=1, churn)
# by default.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy, precision, recall, f1
# Predict Probabilities
# predict_proba returns one column per class; [:, 1] keeps only the
# probability of the positive (churn) class for each test row.
y_prob = clf.predict_proba(X_test)[:, 1]
y_prob[:10]
# Note on Thresholds
# By default, class predictions use a threshold of 0.5.
# In the evaluation lesson, we will examine how changing this threshold
# trades off precision against recall.