-
Notifications
You must be signed in to change notification settings - Fork 0
/
Logistic Regression.py
200 lines (138 loc) · 5.73 KB
/
Logistic Regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# pandas: DataFrame handling for the tabular income data
import pandas as pd
# numpy: numerical operations / unique-value inspection
import numpy as np
# seaborn: statistical data visualization
import seaborn as sns
# train/test partitioning of the dataset
from sklearn.model_selection import train_test_split
# Logistic-regression classifier
from sklearn.linear_model import LogisticRegression
# Performance metrics: accuracy score & confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix
# Load the income dataset; the literal " ?" (note the leading space) is the
# missing-value token used in this file and is parsed as NaN.
data_income = pd.read_csv('income.csv', na_values=[" ?"])
# Work on a copy so the raw import stays untouched.
data = data_income.copy()
"""
#Exploratory data analysis:
#1.Getting to know the data
#2.Data preprocessing (Missing values)
#3.Cross tables and data visualization
"""
# To check variables' data type
print(data.info())
# Check for missing values
data.isnull()
print('Data columns with null values:\n', data.isnull().sum())
# Summary of numerical variables
summary_num = data.describe()
print(summary_num)
# Summary of categorical variables
summary_cate = data.describe(include="O")
print(summary_cate)
# **** Frequency of each categories
data['JobType'].value_counts()
data['occupation'].value_counts()
# Checking for unique classes
print(np.unique(data['JobType']))
print(np.unique(data['occupation']))
data = pd.read_csv('income.csv', na_values=[" ?"])
data.isnull().sum()
missing = data[data.isnull().any(axis=1)] # axis=1 => to consider at least one column value is missing in a row
data2 = data.dropna(axis=0)
data3 = data2.copy()
data4 = data3.copy()
# Relationship (correlation) among the numeric independent variables.
# numeric_only=True is required from pandas 2.0 on, where DataFrame.corr
# raises on object-dtype columns instead of silently skipping them.
correlation = data2.corr(numeric_only=True)

# ---- Cross tables & data visualization ----------------------------------
print(data2.columns)

# Gender proportion table
gender = pd.crosstab(index=data2["gender"], columns='count', normalize=True)
print(gender)

# Gender vs. salary status; normalize='index' gives row-wise proportions,
# margins=True appends row/column totals.
gender_salstat = pd.crosstab(index=data2["gender"], columns=data2['SalStat'],
                             margins=True, normalize='index')
print(gender_salstat)

# Frequency of each salary class. Column-name form (x=...) is used because
# positional data arguments were removed in seaborn 0.13.
SalStat = sns.countplot(x='SalStat', data=data2)

# Age distribution (histplot replaces the removed distplot API)
sns.histplot(data2['age'], bins=10, kde=False)

# Age vs. salary status
sns.boxplot(x='SalStat', y='age', data=data2)
print(data2.groupby('SalStat')['age'].median())

# Job type vs. salary status
JobType = sns.countplot(y='JobType', hue='SalStat', data=data2)
job_salstat = pd.crosstab(index=data2["JobType"], columns=data2['SalStat'],
                          margins=True, normalize='index')
print(round(job_salstat * 100, 1))

# Education vs. salary status
Education = sns.countplot(y='EdType', hue='SalStat', data=data2)
EdType_salstat = pd.crosstab(index=data2["EdType"], columns=data2['SalStat'],
                             margins=True, normalize='index')
print(round(EdType_salstat * 100, 1))

# Occupation vs. salary status
Occupation = sns.countplot(y='occupation', hue='SalStat', data=data2)
occ_salstat = pd.crosstab(index=data2["occupation"], columns=data2['SalStat'],
                          margins=True, normalize='index')
print(round(occ_salstat * 100, 1))

# Capital gain / loss distributions
sns.histplot(data2['capitalgain'], bins=10, kde=False)
sns.histplot(data2['capitalloss'], bins=10, kde=False)
# =============================================================================
# LOGISTIC REGRESSION
# =============================================================================
# Recode the salary-status labels to 0/1. The leading space in each key is
# part of the category strings as they appear in this dataset.
data2['SalStat'] = data2['SalStat'].map({' less than or equal to 50,000': 0,
                                         ' greater than 50,000': 1})
print(data2['SalStat'])

# One-hot encode the categorical columns; drop_first avoids the dummy trap
new_data = pd.get_dummies(data2, drop_first=True)

# Storing the column names
columns_list = list(new_data.columns)
print(columns_list)

# Input feature names = all columns except the target
features = list(set(columns_list) - set(['SalStat']))
print(features)

# Target vector
y = new_data['SalStat'].values
print(y)

# Feature matrix
x = new_data[features].values
print(x)

# 70/30 train/test split; fixed random_state for reproducibility
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3,
                                                    random_state=0)

# Make an instance of the model and fit it
logistic = LogisticRegression()
logistic.fit(train_x, train_y)
print(logistic.coef_)
print(logistic.intercept_)

# Prediction from test data
prediction = logistic.predict(test_x)
print(prediction)

# Confusion matrix. Stored under a NEW name: the original script rebound the
# imported functions confusion_matrix / accuracy_score to their results,
# shadowing them for any later call in the session.
conf_matrix = confusion_matrix(test_y, prediction)
print(conf_matrix)

# Accuracy on the held-out test set
acc = accuracy_score(test_y, prediction)
print(acc)

# Number of misclassified test samples
print('Misclassified samples: %d' % (test_y != prediction).sum())
# =============================================================================
# SECOND PASS: MODEL WITHOUT THE INSIGNIFICANT VARIABLES
# =============================================================================
# Recode the salary-status labels of the untouched copy to 0/1.
data3['SalStat'] = data3['SalStat'].map({' less than or equal to 50,000': 0, ' greater than 50,000': 1})
print(data3['SalStat'])

# Drop the variables judged insignificant, then one-hot encode the rest.
drop_cols = ['gender', 'nativecountry', 'race', 'JobType']
new_data = pd.get_dummies(data3.drop(drop_cols, axis=1), drop_first=True)

# Column names of the reduced, encoded frame
reduced_columns = list(new_data.columns)
print(reduced_columns)

# Predictor names: everything except the target column
predictors = list(set(reduced_columns) - set(['SalStat']))
print(predictors)

# Target vector and feature matrix
target_vals = new_data['SalStat'].values
print(target_vals)
feature_vals = new_data[predictors].values
print(feature_vals)

# Same 70/30 split and seed as the first pass, for comparability
tr_x, te_x, tr_y, te_y = train_test_split(feature_vals, target_vals, test_size=0.3, random_state=0)

# Fit the reduced model and score it on the held-out set
model2 = LogisticRegression()
model2.fit(tr_x, tr_y)
test_pred = model2.predict(te_x)
print('Misclassified samples: %d' % (te_y != test_pred).sum())