# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
sns.set()
8 Logistic regression: Others
8.1 Decision Boundary in Classification
# Importing the dataset
dataset = pd.read_csv('datasets/apples_and_oranges.csv')
dataset.head()
| | Weight | Size | Class |
|---|---|---|---|
0 | 69 | 4.39 | orange |
1 | 69 | 4.21 | orange |
2 | 65 | 4.09 | orange |
3 | 72 | 5.85 | apple |
4 | 67 | 4.70 | orange |
# No. of apples and oranges
dataset['Class'].value_counts()
Class
orange 20
apple 20
Name: count, dtype: int64
8.1.1 Encoding Target
le = LabelEncoder()
dataset['Class'] = le.fit_transform(dataset['Class'])
le.classes_
array(['apple', 'orange'], dtype=object)
This implies that:

* 0 represents apple
* 1 represents orange
dataset.head()
| | Weight | Size | Class |
|---|---|---|---|
0 | 69 | 4.39 | 1 |
1 | 69 | 4.21 | 1 |
2 | 65 | 4.09 | 1 |
3 | 72 | 5.85 | 0 |
4 | 67 | 4.70 | 1 |
8.1.2 Plotting the dataset
plt.figure(figsize=(9,6))
plt.title('Apples and Oranges', fontweight='bold', fontsize=16)
plt.xlabel('Weight')
plt.ylabel('Size')
scatter = plt.scatter(dataset['Weight'], dataset['Size'], c=dataset['Class'], cmap='viridis')
plt.legend(*scatter.legend_elements(),
           loc = 'upper left',
           title = 'Class');
We can observe that oranges have lower weight and size than apples. Further, by drawing a straight line between these two groups of data points, we can clearly distinguish apples from oranges.
8.1.3 Building a Logistic Regression model to distinguish apples and oranges
As we can clearly distinguish between apples and oranges with a straight-line decision boundary, we can choose the hypothesis \(y = a_0 + a_1 x_1 + a_2 x_2\) for logistic regression (the corresponding probability model and boundary condition are spelled out below), where:

* \(a_0, a_1, a_2\) are the fitting parameters
* \(x_1\) is Weight
* \(x_2\) is Size
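To make the link between this hypothesis and the decision boundary explicit (this equation is implied by the model but was not written out in the original), the linear term is passed through the sigmoid function, which is defined in code later in this chapter:

\[ P(\text{orange} \mid x_1, x_2) = \frac{1}{1 + e^{-(a_0 + a_1 x_1 + a_2 x_2)}} \]

The decision boundary is the set of points where this probability equals 0.5, i.e. the straight line \(a_0 + a_1 x_1 + a_2 x_2 = 0\).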
# Defining target and features
y = dataset['Class']
x = dataset.drop(columns=['Class'])
# Creating object of LogisticRegression class
log_reg = LogisticRegression()
# Fitting parameters
log_reg.fit(x,y)
LogisticRegression()
# Intercept - a0
log_reg.intercept_
array([106.60287324])
# Coefficients - a1, a2 respectively
log_reg.coef_
array([[-1.42833694, -1.31285258]])
# Predicting labels for the given dataset
label_predictions = log_reg.predict(x)
label_predictions
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0])
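As a quick sanity check (not part of the original notebook), accuracy_score is already imported, so we can compare these predictions against the true labels:

# Training-set accuracy of the linear model (uses y and label_predictions defined above)
print(accuracy_score(y, label_predictions))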
8.1.4 Linear Decision Boundary with naive features
# Parameter values
a0 = log_reg.intercept_[0]
a1 = log_reg.coef_[0][0]
a2 = log_reg.coef_[0][1]
# Defining x1 and x2 values for decision boundary
# The boundary is where a0 + a1*x1 + a2*x2 = 0, i.e. x2 = -(a0/a2) - (a1/a2)*x1
x1 = np.array([69, 71])
x2 = -(a0 / a2) - (a1 / a2) * x1
# Plotting the decision boundary
plt.figure(figsize=(9,6))
plt.title('Apples and Oranges', fontweight='bold', fontsize=16)
plt.xlabel('Weight')
plt.ylabel('Size')
scatter = plt.scatter(dataset['Weight'], dataset['Size'], c=dataset['Class'], cmap='viridis')
plt.legend(*scatter.legend_elements(),
           loc = 'upper left',
           title = 'Class')
plt.plot(x1, x2, color='red', label='Decision Boundary')
plt.show()
In this problem we have just two features, \(x_1\) and \(x_2\). If we use them as they are, we end up with a straight line that divides our 2D plane into two half-planes.
8.2 Non-linear Decision Boundary
On some occasions we want a more complex boundary, and we can achieve this by transforming our features. For instance, when confronted with a training data distribution like the one illustrated below, we need to generate polynomial features such as \(x_1^2\) and \(x_2^2\) to better separate the two classes.

Let's work through a concrete example. For the purpose of illustrating the decision boundary, I chose not to split the data into training and test sets.
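For degree 2, for example, the linear term \(z\) becomes the expansion below; this is exactly the feature set that mapFeature and PolynomialFeatures produce later in this section:

\[ z = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \theta_3 x_1^2 + \theta_4 x_1 x_2 + \theta_5 x_2^2 \]

so the decision boundary \(z = 0\) can now be a curve (for instance an ellipse) rather than a straight line.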
# Load our dataset for logistic regression
components = pd.read_csv('datasets/ex2data2.txt', header=None, names = ['feature 1', 'feature 2', 'faulty'])
components.head()
| | feature 1 | feature 2 | faulty |
|---|---|---|---|
0 | 0.051267 | 0.69956 | 1 |
1 | -0.092742 | 0.68494 | 1 |
2 | -0.213710 | 0.69225 | 1 |
3 | -0.375000 | 0.50219 | 1 |
4 | -0.513250 | 0.46564 | 1 |
# check the balance of the dataset
components['faulty'].value_counts()
faulty
0 60
1 58
Name: count, dtype: int64
# get positive and negative samples for plotting
pos = components['faulty'] == 1
neg = components['faulty'] == 0
# Visualize Data
fig, axes = plt.subplots();
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color = 'r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color = 'g', marker='o', label='Non Faulty')
axes.legend(title='Legend', loc = 'best')
axes.set_xlim(-1, 1.5)
axes.set_ylim(-1, 1.5)
As we can see, the positive and negative examples are not linearly separable, so we have to add higher-order polynomial features.
# define function to map higher order polynomial features
def mapFeature(X1, X2, degree):
    res = np.ones(X1.shape[0])
    for i in range(1, degree + 1):
        for j in range(0, i + 1):
            res = np.column_stack((res, (X1 ** (i-j)) * (X2 ** j)))

    return res
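As a quick illustration (not in the original), for degree 2 this stacks the columns \(1, x_1, x_2, x_1^2, x_1 x_2, x_2^2\):

# Illustrative check of mapFeature on a single point (x1, x2) = (2, 3) with degree 2
demo = mapFeature(np.array([2.0]), np.array([3.0]), 2)
print(demo)   # expected: [[1. 2. 3. 4. 6. 9.]]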
# Get the features
X = components.iloc[:, :2]

degree = 2
X_poly = mapFeature(X.iloc[:, 0], X.iloc[:, 1], degree)
# Get the target variable
y = components.iloc[:, 2]
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
def costFunc(theta, X, y):
    m = y.shape[0]
    z = X.dot(theta)
    h = sigmoid(z)
    term1 = y * np.log(h)
    term2 = (1 - y) * np.log(1 - h)
    J = -np.sum(term1 + term2, axis = 0) / m
    return J
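In other words, costFunc implements the standard (unregularized) cross-entropy cost

\[ J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y_i \log(h_i) + (1 - y_i) \log(1 - h_i) \right], \qquad h_i = \mathrm{sigmoid}(x_i^\top \theta). \]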
# Set initial values for our parameters
initial_theta = np.zeros(X_poly.shape[1]).reshape(X_poly.shape[1], 1)
# Now call the optimization routine
# NOTE: minimize chooses its own step sizes, so there is no learning rate to tune
from scipy.optimize import minimize
res = minimize(costFunc, initial_theta.flatten(), args=(X_poly, y))
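A quick way (not in the original) to confirm that the optimizer converged and to inspect the minimized cost is to look at the returned OptimizeResult:

# res.success is True if the optimizer converged; res.fun is the final value of costFunc
print(res.success, res.fun)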
# our optimized coefficients
theta = res.x
# define a function to plot the decision boundary
def plotDecisionBoundary(theta, degree, axes):
    u = np.linspace(-1, 1.5, 50)
    v = np.linspace(-1, 1.5, 50)
    U, V = np.meshgrid(u, v)
    # convert U, V to vectors for calculating additional features
    # using vectorized implementation
    U = np.ravel(U)
    V = np.ravel(V)
    Z = np.zeros((len(u) * len(v)))

    X_poly = mapFeature(U, V, degree)
    # the decision boundary is the set of points where X_poly.dot(theta) = 0, i.e. where sigmoid = 0.5
    Z = X_poly.dot(theta)

    # reshape U, V, Z back to matrix
    U = U.reshape((len(u), len(v)))
    V = V.reshape((len(u), len(v)))
    Z = Z.reshape((len(u), len(v)))

    cs = axes.contour(U, V, Z, levels=[0], cmap="Greys_r")
    axes.legend(labels=['Non Faulty', 'Faulty', 'Decision Boundary'])
    return cs
# Plot Decision boundary
fig, axes = plt.subplots();
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color = 'r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color = 'g', marker='o', label='Good')
#axes.legend(title='Legend', loc = 'best')
plotDecisionBoundary(theta, degree, axes);
Of course, you can increase the degree of the polynomial you want to fit, but overfitting can then become a problem.
# set degree = 6
degree = 6
# map features to the degree
X_poly = mapFeature(X.iloc[:, 0], X.iloc[:, 1], degree)
# set initial parameters
initial_theta = np.zeros(X_poly.shape[1]).reshape(X_poly.shape[1], 1)
# Run the optimization function
res = minimize(costFunc, initial_theta.flatten(), args=(X_poly, y))
theta = res.x.reshape(res.x.shape[0], 1)
# Plot Decision boundary
fig, axes = plt.subplots()
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color='r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color='g', marker='o', label='Good')
#axes.legend(title='Legend', loc='best')
plotDecisionBoundary(theta, degree, axes)
As we can see, the model tries very hard to capture every single example and overfits the data. Such a model has no preconceived notion about the separation of the positive and negative examples and can fit almost any kind of data, so it will fail to predict the correct class when it sees new examples.
One technique to counter this is regularization, which we will cover later. The idea is to penalize the algorithm when it tries to overfit by adding a regularization term to the cost function.
The new cost function with regularization is:

\(J(\theta) = \frac{1}{m} \sum_{i=1}^m \left[ -y_i \log(h_\theta(z_i)) - (1 - y_i) \log(1 - h_\theta(z_i)) \right] + \frac{\lambda}{2m} \sum_{j=1}^n \theta_j^2\)
where \(\lambda\) is the regularization factor and \(n\) is the number of features.
(NOTE: The regularization term does not include the intercept term \(\theta_0\); the sum starts at \(j = 1\).)
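As an illustration only (regularization itself is covered later), here is a minimal sketch of how costFunc above could be extended with this penalty; costFuncReg is a hypothetical helper and is not used elsewhere in this chapter:

# Sketch of the regularized cross-entropy cost (assumes sigmoid and numpy from above)
def costFuncReg(theta, X, y, lam):
    m = y.shape[0]
    h = sigmoid(X.dot(theta))
    unreg = -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) / m
    penalty = (lam / (2 * m)) * np.sum(theta[1:] ** 2)  # theta[0], the intercept, is not penalized
    return unreg + penalty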
8.2.1 By adding Polynomial Features
components
| | feature 1 | feature 2 | faulty |
|---|---|---|---|
0 | 0.051267 | 0.699560 | 1 |
1 | -0.092742 | 0.684940 | 1 |
2 | -0.213710 | 0.692250 | 1 |
3 | -0.375000 | 0.502190 | 1 |
4 | -0.513250 | 0.465640 | 1 |
... | ... | ... | ... |
113 | -0.720620 | 0.538740 | 0 |
114 | -0.593890 | 0.494880 | 0 |
115 | -0.484450 | 0.999270 | 0 |
116 | -0.006336 | 0.999270 | 0 |
117 | 0.632650 | -0.030612 | 0 |
118 rows × 3 columns
X = components[['feature 1', 'feature 2']]
y = components['faulty']
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
feature_names = poly.get_feature_names_out()
print(feature_names)
['1' 'feature 1' 'feature 2' 'feature 1^2' 'feature 1 feature 2'
'feature 2^2']
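The first name is the constant '1' column that PolynomialFeatures adds by default. Since LogisticRegression fits its own intercept, one optional variant (not used in the original) is to drop that column:

# Optional: generate the same polynomial terms without the constant column
poly_no_bias = PolynomialFeatures(degree=2, include_bias=False)
X_poly_nb = poly_no_bias.fit_transform(X)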
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_poly, y)
LogisticRegression()
label_predictions = model.predict(X_poly)
8.2.1.1 Accuracy Score
accuracy_score(y, label_predictions)
0.8135593220338984
8.2.1.2 Confusion Matrix
cm = confusion_matrix(y, label_predictions)
cm
array([[47, 13],
[ 9, 49]], dtype=int64)
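Since seaborn is already imported, one optional way (not in the original) to visualize this matrix is a labelled heatmap:

# Optional visualization of the confusion matrix computed above
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['predicted 0', 'predicted 1'],
            yticklabels=['actual 0', 'actual 1'])
plt.show()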
8.2.2 By Transforming Continuous Variable
Variable transformation is an important technique to create robust models using logistic regression. Because the predictors are linear in the log of the odds, it is often helpful to transform the continuous variables to create a more linear relationship. To determine the best transformation of a continuous variable, a univariate plot is very helpful. Remember the nice univariate plot of Y variable against X variable in linear regression? This is not easily attained, because Y is dichotomous in logistic regression.
There are different recommended solutions. Among them:

* One is to create several variations of the variable (squared, cubed, or logged transformations, etc.).
* Another is to break some continuous variables into segments and treat them as categorical variables. This may work well to pick up nonlinear trends. The biggest drawback is that it loses the benefit of the linear trend in the curve, and it may also lead to overfitting.
train = pd.read_csv('./Datasets/Social_Network_Ads_train.csv') # Develop the model on train data
test = pd.read_csv('./Datasets/Social_Network_Ads_test.csv')   # Test the model on test data
train.head()
| | User ID | Gender | Age | EstimatedSalary | Purchased |
|---|---|---|---|---|---|
0 | 15755018 | Male | 36 | 33000 | 0 |
1 | 15697020 | Female | 39 | 61000 | 0 |
2 | 15796351 | Male | 36 | 118000 | 1 |
3 | 15665760 | Male | 39 | 122000 | 1 |
4 | 15794661 | Female | 26 | 118000 | 0 |
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User ID 300 non-null int64
1 Gender 300 non-null object
2 Age 300 non-null int64
3 EstimatedSalary 300 non-null int64
4 Purchased 300 non-null int64
dtypes: int64(4), object(1)
memory usage: 11.8+ KB
train.Gender.value_counts()
Gender
Female 151
Male 149
Name: count, dtype: int64
tmp_1 = pd.get_dummies(train['Gender'], drop_first=True)
train = pd.concat([train, tmp_1], axis=1)
train.head()
| | User ID | Gender | Age | EstimatedSalary | Purchased | Male |
|---|---|---|---|---|---|---|
0 | 15755018 | Male | 36 | 33000 | 0 | True |
1 | 15697020 | Female | 39 | 61000 | 0 | False |
2 | 15796351 | Male | 36 | 118000 | 1 | True |
3 | 15665760 | Male | 39 | 122000 | 1 | True |
4 | 15794661 | Female | 26 | 118000 | 0 | False |
tmp_1 = pd.get_dummies(test['Gender'], drop_first=True)
test = pd.concat([test, tmp_1], axis=1)
test.head()
| | User ID | Gender | Age | EstimatedSalary | Purchased | Male |
|---|---|---|---|---|---|---|
0 | 15810944 | Male | 35 | 20000 | 0 | True |
1 | 15668575 | Female | 26 | 43000 | 0 | False |
2 | 15603246 | Female | 27 | 57000 | 0 | False |
3 | 15694829 | Female | 32 | 150000 | 1 | False |
4 | 15697686 | Male | 29 | 80000 | 0 | True |
# Separating features and target on training set
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID"], axis = 1)
X_train
| | Age | EstimatedSalary | Male |
|---|---|---|---|
0 | 36 | 33000 | True |
1 | 39 | 61000 | False |
2 | 36 | 118000 | True |
3 | 39 | 122000 | True |
4 | 26 | 118000 | False |
... | ... | ... | ... |
295 | 48 | 96000 | False |
296 | 42 | 149000 | True |
297 | 28 | 79000 | True |
298 | 51 | 134000 | False |
299 | 33 | 28000 | False |
300 rows × 3 columns
# Separating features and target on test set
y_test = test.Purchased
X_test = test.drop(["Purchased", "Gender", "User ID"], axis = 1)
X_test
| | Age | EstimatedSalary | Male |
|---|---|---|---|
0 | 35 | 20000 | True |
1 | 26 | 43000 | False |
2 | 27 | 57000 | False |
3 | 32 | 150000 | False |
4 | 29 | 80000 | True |
... | ... | ... | ... |
95 | 49 | 39000 | False |
96 | 47 | 34000 | True |
97 | 60 | 42000 | True |
98 | 39 | 59000 | False |
99 | 51 | 23000 | True |
100 rows × 3 columns
from sklearn.linear_model import LogisticRegression
sklearn_model = LogisticRegression()
sklearn_model.fit(X_train, y_train)
LogisticRegression()
y_pred_test = sklearn_model.predict(X_test)
print('Accuracy of logistic regression on test set : {:.4f}'.format(accuracy_score(y_test, y_pred_test)))
Accuracy of logistic regression on test set : 0.8800
8.2.2.1 Log transformation of salary
sns.histplot(train.EstimatedSalary)
"log_salary"] = np.log(train["EstimatedSalary"])
train[ sns.histplot(train.log_salary)
The reason for such transformations has nothing to do with the variable's distribution. Instead, it has to do with the functional form of the effect. Say we want to know the effect of the number of publications on the probability of getting tenure. It is reasonable to believe that an extra publication has more impact when one has only 1 publication than when one has already published 50 articles. The log transformation is one way to capture such a (testable) assumption of diminishing returns.
"log_salary"] = np.log(test["EstimatedSalary"]) test[
# Separating features and target
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID", "EstimatedSalary"], axis = 1)
X_train
| | Age | Male | log_salary |
|---|---|---|---|
0 | 36 | True | 10.404263 |
1 | 39 | False | 11.018629 |
2 | 36 | True | 11.678440 |
3 | 39 | True | 11.711776 |
4 | 26 | False | 11.678440 |
... | ... | ... | ... |
295 | 48 | False | 11.472103 |
296 | 42 | True | 11.911702 |
297 | 28 | True | 11.277203 |
298 | 51 | False | 11.805595 |
299 | 33 | False | 10.239960 |
300 rows × 3 columns
from sklearn.linear_model import LogisticRegression
sklearn_model_log = LogisticRegression()
sklearn_model_log.fit(X_train, y_train)
LogisticRegression()
# Separating features and target for the test dataset
y_test_log = test.Purchased
X_test_log = test.drop(["Purchased", "Gender", "User ID", "EstimatedSalary"], axis = 1)
y_log_pred_test = sklearn_model_log.predict(X_test_log)
print('Accuracy of logistic regression after log transformation of salary on test set : {:.4f}'.format(accuracy_score(y_test_log, y_log_pred_test)))
Accuracy of logistic regression after log transformation of salary on test set : 0.8300
8.2.3 By Binning Continuous Variables
sns.histplot(data=train.Age)
bins = [train.Age.min()-1, 25, 35, 48, train.Age.max()]
labels = ['Big Kid', 'Young Adult', 'Adult', 'Senior']
train['AgeGroup'] = pd.cut(train["Age"], bins, labels = labels)

# draw a bar plot of AgeGroup vs. purchase rate
sns.barplot(x="AgeGroup", y="Purchased", data=train)
plt.show()
train
| | User ID | Gender | Age | EstimatedSalary | Purchased | Male | log_salary | AgeGroup |
|---|---|---|---|---|---|---|---|---|
0 | 15755018 | Male | 36 | 33000 | 0 | True | 10.404263 | Adult |
1 | 15697020 | Female | 39 | 61000 | 0 | False | 11.018629 | Adult |
2 | 15796351 | Male | 36 | 118000 | 1 | True | 11.678440 | Adult |
3 | 15665760 | Male | 39 | 122000 | 1 | True | 11.711776 | Adult |
4 | 15794661 | Female | 26 | 118000 | 0 | False | 11.678440 | Young Adult |
... | ... | ... | ... | ... | ... | ... | ... | ... |
295 | 15724536 | Female | 48 | 96000 | 1 | False | 11.472103 | Adult |
296 | 15701537 | Male | 42 | 149000 | 1 | True | 11.911702 | Adult |
297 | 15807481 | Male | 28 | 79000 | 0 | True | 11.277203 | Young Adult |
298 | 15603942 | Female | 51 | 134000 | 0 | False | 11.805595 | Senior |
299 | 15690188 | Female | 33 | 28000 | 0 | False | 10.239960 | Young Adult |
300 rows × 8 columns
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
train['AgeGroup'] = label_encoder.fit_transform(train['AgeGroup'])
train['AgeGroup'].unique()
array([0, 3, 1, 2])
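One thing worth checking (not in the original) is which code was assigned to which age group; LabelEncoder sorts the labels alphabetically, so the codes do not follow the age order:

# Recover the label-to-code mapping from the fitted encoder
dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))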
test['AgeGroup'] = pd.cut(test["Age"], bins, labels = labels)
# Note: the encoder is refit here; reusing the encoder fitted on train (transform instead of fit_transform) is the safer pattern
test['AgeGroup'] = label_encoder.fit_transform(test['AgeGroup'])
train
| | User ID | Gender | Age | EstimatedSalary | Purchased | Male | log_salary | AgeGroup |
|---|---|---|---|---|---|---|---|---|
0 | 15755018 | Male | 36 | 33000 | 0 | True | 10.404263 | 0 |
1 | 15697020 | Female | 39 | 61000 | 0 | False | 11.018629 | 0 |
2 | 15796351 | Male | 36 | 118000 | 1 | True | 11.678440 | 0 |
3 | 15665760 | Male | 39 | 122000 | 1 | True | 11.711776 | 0 |
4 | 15794661 | Female | 26 | 118000 | 0 | False | 11.678440 | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
295 | 15724536 | Female | 48 | 96000 | 1 | False | 11.472103 | 0 |
296 | 15701537 | Male | 42 | 149000 | 1 | True | 11.911702 | 0 |
297 | 15807481 | Male | 28 | 79000 | 0 | True | 11.277203 | 3 |
298 | 15603942 | Female | 51 | 134000 | 0 | False | 11.805595 | 2 |
299 | 15690188 | Female | 33 | 28000 | 0 | False | 10.239960 | 3 |
300 rows × 8 columns
# Separating features and target on train set
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID", "EstimatedSalary", "Age"], axis = 1)
X_train
| | Male | log_salary | AgeGroup |
|---|---|---|---|
0 | True | 10.404263 | 0 |
1 | False | 11.018629 | 0 |
2 | True | 11.678440 | 0 |
3 | True | 11.711776 | 0 |
4 | False | 11.678440 | 3 |
... | ... | ... | ... |
295 | False | 11.472103 | 0 |
296 | True | 11.911702 | 0 |
297 | True | 11.277203 | 3 |
298 | False | 11.805595 | 2 |
299 | False | 10.239960 | 3 |
300 rows × 3 columns
from sklearn.linear_model import LogisticRegression
sklearn_model_bin = LogisticRegression()
sklearn_model_bin.fit(X_train, y_train)
LogisticRegression()
# Separating features and target on test set
y_test = test.Purchased
X_test_bin = test.drop(["Purchased", "Gender", "User ID", "EstimatedSalary", "Age"], axis = 1)
X_test_bin
| | Male | log_salary | AgeGroup |
|---|---|---|---|
0 | True | 9.903488 | 3 |
1 | False | 10.668955 | 3 |
2 | False | 10.950807 | 3 |
3 | False | 11.918391 | 3 |
4 | True | 11.289782 | 3 |
... | ... | ... | ... |
95 | False | 10.571317 | 2 |
96 | True | 10.434116 | 0 |
97 | True | 10.645425 | 2 |
98 | False | 10.985293 | 0 |
99 | True | 10.043249 | 2 |
100 rows × 3 columns
y_bin_pred_test = sklearn_model_bin.predict(X_test_bin)
print('Accuracy of logistic regression after age binning on test set : {:.4f}'.format(accuracy_score(y_test, y_bin_pred_test)))
Accuracy of logistic regression after age binning on test set : 0.7100
8.3 Reference
- https://www.linkedin.com/pulse/generating-non-linear-decision-boundaries-using-logistic-d-urso/
- https://jermwatt.github.io/machine_learning_refined/notes/10_Nonlinear_intro/10_4_Twoclass.html
- https://www.kaggle.com/code/lzs0047/logistic-regression-non-linear-decision-boundary/edit
- https://www.kaggle.com/code/ashishrane7/logistic-regression-non-linear-decision-boundary/notebook