-
Notifications
You must be signed in to change notification settings - Fork 0
/
Logistic Regression.py
200 lines (138 loc) · 5.73 KB
/
Logistic Regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# pandas: DataFrame handling for the tabular income data
import pandas as pd
# numpy: numerical operations / unique-value inspection
import numpy as np
# seaborn: statistical data visualization
import seaborn as sns
# train/test partitioning of the dataset
from sklearn.model_selection import train_test_split
# Logistic-regression classifier
from sklearn.linear_model import LogisticRegression
# Performance metrics: accuracy score & confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix
# Load the income dataset; the literal " ?" (note the leading space) is the
# missing-value token used in this file and is parsed as NaN.
data_income = pd.read_csv('income.csv', na_values=[" ?"])
# Work on a copy so the raw import stays untouched.
data = data_income.copy()
"""
#Exploratory data analysis:
#1.Getting to know the data
#2.Data preprocessing (Missing values)
#3.Cross tables and data visualization
"""
# To check variables' data type
print(data.info())
# Check for missing values
data.isnull()
print('Data columns with null values:\n', data.isnull().sum())
# Summary of numerical variables
summary_num = data.describe()
print(summary_num)
# Summary of categorical variables
summary_cate = data.describe(include="O")
print(summary_cate)
# **** Frequency of each categories
data['JobType'].value_counts()
data['occupation'].value_counts()
# Checking for unique classes
print(np.unique(data['JobType']))
print(np.unique(data['occupation']))
data = pd.read_csv('income.csv', na_values=[" ?"])
data.isnull().sum()
missing = data[data.isnull().any(axis=1)] # axis=1 => to consider at least one column value is missing in a row
data2 = data.dropna(axis=0)
data3 = data2.copy()
data4 = data3.copy()
# Relationship (correlation) among the numeric independent variables.
# numeric_only=True is required from pandas 2.0 on, where DataFrame.corr
# raises on object-dtype columns instead of silently skipping them.
correlation = data2.corr(numeric_only=True)

# ---- Cross tables & data visualization ----------------------------------
print(data2.columns)

# Gender proportion table
gender = pd.crosstab(index=data2["gender"], columns='count', normalize=True)
print(gender)

# Gender vs. salary status; normalize='index' gives row-wise proportions,
# margins=True appends row/column totals.
gender_salstat = pd.crosstab(index=data2["gender"], columns=data2['SalStat'],
                             margins=True, normalize='index')
print(gender_salstat)

# Frequency of each salary class. Column-name form (x=...) is used because
# positional data arguments were removed in seaborn 0.13.
SalStat = sns.countplot(x='SalStat', data=data2)

# Age distribution (histplot replaces the removed distplot API)
sns.histplot(data2['age'], bins=10, kde=False)

# Age vs. salary status
sns.boxplot(x='SalStat', y='age', data=data2)
print(data2.groupby('SalStat')['age'].median())

# Job type vs. salary status
JobType = sns.countplot(y='JobType', hue='SalStat', data=data2)
job_salstat = pd.crosstab(index=data2["JobType"], columns=data2['SalStat'],
                          margins=True, normalize='index')
print(round(job_salstat * 100, 1))

# Education vs. salary status
Education = sns.countplot(y='EdType', hue='SalStat', data=data2)
EdType_salstat = pd.crosstab(index=data2["EdType"], columns=data2['SalStat'],
                             margins=True, normalize='index')
print(round(EdType_salstat * 100, 1))

# Occupation vs. salary status
Occupation = sns.countplot(y='occupation', hue='SalStat', data=data2)
occ_salstat = pd.crosstab(index=data2["occupation"], columns=data2['SalStat'],
                          margins=True, normalize='index')
print(round(occ_salstat * 100, 1))

# Capital gain / loss distributions
sns.histplot(data2['capitalgain'], bins=10, kde=False)
sns.histplot(data2['capitalloss'], bins=10, kde=False)
# =============================================================================
# LOGISTIC REGRESSION
# =============================================================================
# Recode the salary-status labels to 0/1. The leading space in each key is
# part of the category strings as they appear in this dataset.
data2['SalStat'] = data2['SalStat'].map({' less than or equal to 50,000': 0,
                                         ' greater than 50,000': 1})
print(data2['SalStat'])

# One-hot encode the categorical columns; drop_first avoids the dummy trap
new_data = pd.get_dummies(data2, drop_first=True)

# Storing the column names
columns_list = list(new_data.columns)
print(columns_list)

# Input feature names = all columns except the target
features = list(set(columns_list) - set(['SalStat']))
print(features)

# Target vector
y = new_data['SalStat'].values
print(y)

# Feature matrix
x = new_data[features].values
print(x)

# 70/30 train/test split; fixed random_state for reproducibility
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3,
                                                    random_state=0)

# Make an instance of the model and fit it
logistic = LogisticRegression()
logistic.fit(train_x, train_y)
print(logistic.coef_)
print(logistic.intercept_)

# Prediction from test data
prediction = logistic.predict(test_x)
print(prediction)

# Confusion matrix. Stored under a NEW name: the original script rebound the
# imported functions confusion_matrix / accuracy_score to their results,
# shadowing them for any later call in the session.
conf_matrix = confusion_matrix(test_y, prediction)
print(conf_matrix)

# Accuracy on the held-out test set
acc = accuracy_score(test_y, prediction)
print(acc)

# Number of misclassified test samples
print('Misclassified samples: %d' % (test_y != prediction).sum())
# =============================================================================
# SECOND PASS: MODEL WITHOUT THE INSIGNIFICANT VARIABLES
# =============================================================================
# Recode the salary-status labels of the untouched copy to 0/1.
data3['SalStat'] = data3['SalStat'].map({' less than or equal to 50,000': 0, ' greater than 50,000': 1})
print(data3['SalStat'])

# Drop the variables judged insignificant, then one-hot encode the rest.
drop_cols = ['gender', 'nativecountry', 'race', 'JobType']
new_data = pd.get_dummies(data3.drop(drop_cols, axis=1), drop_first=True)

# Column names of the reduced, encoded frame
reduced_columns = list(new_data.columns)
print(reduced_columns)

# Predictor names: everything except the target column
predictors = list(set(reduced_columns) - set(['SalStat']))
print(predictors)

# Target vector and feature matrix
target_vals = new_data['SalStat'].values
print(target_vals)
feature_vals = new_data[predictors].values
print(feature_vals)

# Same 70/30 split and seed as the first pass, for comparability
tr_x, te_x, tr_y, te_y = train_test_split(feature_vals, target_vals, test_size=0.3, random_state=0)

# Fit the reduced model and score it on the held-out set
model2 = LogisticRegression()
model2.fit(tr_x, tr_y)
test_pred = model2.predict(te_x)
print('Misclassified samples: %d' % (te_y != test_pred).sum())