-
Notifications
You must be signed in to change notification settings - Fork 0
/
optimal_score.py
174 lines (148 loc) · 6.51 KB
/
optimal_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
from sklearn import preprocessing
from scipy.stats.stats import pearsonr
import pandas as pd
import prov.model
import datetime
import uuid
import dml
import sys
sys.path.append("/Users/lubovmckone/course-2018-spr-proj/agoncharova_lmckone/z3/build/python/")
from z3 import *
class optimal_score(dml.Algorithm):
    '''
    Reads the per-tract stability scores, reports how strongly they correlate
    with the number of businesses, and uses the z3 optimizer to estimate how
    many businesses could be added to each tract to lower its score.
    Results are stored in the optimal_score collection.
    '''
    contributor = 'agoncharova_lmckone'
    reads = ['agoncharova_lmckone.stability_score']
    writes = ['agoncharova_lmckone.optimal_score']

    # Maximum number of tracts solved when running in trial mode.
    _TRIAL_LIMIT = 100

    # HELPER
    @staticmethod
    def get_stability_scores_from_repo():
        '''
        Gets and returns the data from the stability_score collection.

        Returns a list holding every document of the collection.
        '''
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('agoncharova_lmckone', 'agoncharova_lmckone')
        try:
            # Materialize the cursor before logging out.
            return list(repo['agoncharova_lmckone.stability_score'].find())
        finally:
            # Log out even when the query fails.
            repo.logout()

    @staticmethod
    def explore_business_data(all_data):
        '''
        Compute the Pearson correlation coefficient between the stability
        score and the min-max normalized number of businesses.

        all_data: list of records, each with 'businesses' and 'stability'.
        Returns the correlation coefficient, or NaN when fewer than two
        records are available (the coefficient is undefined in that case).
        '''
        if len(all_data) < 2:
            return float('nan')

        # inspiration: http://sebastianraschka.com/Articles/2014_about_feature_scaling.html#about-standardization
        df = pd.DataFrame(all_data)
        minmax_scale = preprocessing.MinMaxScaler().fit(df[['businesses']])
        scaled_rows = minmax_scale.transform(df[['businesses']])
        # transform() returns one single-element row per record; flatten it.
        businesses_scaled = [row[0] for row in scaled_rows]
        stability_list = df['stability'].tolist()

        # Previously observed on the project data:
        # corr = 0.10446045881042658, 2-tailed p-val = 0.13704210138510017
        # i.e. a weak positive correlation that is not statistically
        # significant at the usual 0.05 level.
        corr = pearsonr(stability_list, businesses_scaled)
        return corr[0]

    @staticmethod
    def compute_optimal_num_businesses(all_data, trial):
        '''
        For each tract, ask the z3 optimizer for the smallest positive new
        stability score reachable by adding businesses.

        Tracts with stability > 0.5 may gain at most 5 businesses, each one
        lowering the score by 0.2; tracts with 0.3 < stability <= 0.5 may
        gain at most 3, each lowering the score by 0.1. Tracts outside both
        tiers get no business constraint and are skipped.
        NOTE(review): the original docstring suggests a high score reflects
        many evictions and crimes, hence the goal of lowering it - confirm
        against the code that produces the stability score.

        all_data: list of records with 'Tract', 'businesses' and 'stability'.
        trial: when truthy, only the first _TRIAL_LIMIT records are solved.
        Returns a list of dicts with keys "Tract", "new_optimal_score",
        "stability_score", "addl_businesses" and "businesses".
        '''
        if not all_data:
            return []

        df = pd.DataFrame(all_data)
        # tolist() yields native Python values, which z3 accepts directly.
        tracts = df['Tract'].tolist()
        businesses = df['businesses'].tolist()
        stability = df['stability'].tolist()

        # Never index past the data actually present in the collection.
        limit = len(stability)
        if trial:
            limit = min(optimal_score._TRIAL_LIMIT, limit)

        results = []
        rows = zip(tracts[:limit], businesses[:limit], stability[:limit])
        for i, (tract, business_count, score) in enumerate(rows):
            solver = Optimize()
            new = Real('new' + str(i))
            num_businesses = Int('num_businesses' + str(i))
            solver.add(new > 0)  # new stability score has to be positive
            solver.minimize(new)
            if score > 0.5:
                solver.add(num_businesses <= 5)  # we can't add more than 5 businesses
                solver.add(num_businesses == (score - new) / 0.2)
            elif 0.3 < score <= 0.5:
                solver.add(num_businesses <= 3)  # we can't add more than 3 businesses
                solver.add(num_businesses == (score - new) / 0.1)

            if solver.check() != sat:
                # No model is available unless the problem is satisfiable.
                print("something bad happened :(")
                continue

            model = solver.model()
            new_value = model[new]
            added_value = model[num_businesses]
            if new_value is None or added_value is None:
                # The tract matched neither tier, so num_businesses never
                # appeared in a constraint and has no value in the model.
                print("something bad happened :(")
                continue

            results.append({
                "Tract": tract,
                # as_decimal may append '?' to truncated values; keep 7 chars.
                "new_optimal_score": new_value.as_decimal(5)[:7],
                "stability_score": str(score),
                "addl_businesses": str(added_value.as_long()),
                "businesses": str(business_count),
            })

        print("successfully solved " + str(len(results)) + " SMT problems")
        return results

    @staticmethod
    def execute(trial=False):
        '''
        Report the correlation between the number of businesses and the
        tract stability score (see explore_business_data), then run an SMT
        solver using the z3 library to see how many businesses could be
        added to a tract in order to improve its stability score, and save
        the results to the optimal_score collection.

        trial: when truthy, only a subset of the tracts is solved.
        Returns a dict with the "start" and "end" timestamps of the run.
        '''
        startTime = datetime.datetime.now()
        this = optimal_score
        all_data = this.get_stability_scores_from_repo()
        corr = this.explore_business_data(all_data)
        print("correlation between businesses and stability score is: " + str(corr))
        results = this.compute_optimal_num_businesses(all_data, trial)

        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('agoncharova_lmckone', 'agoncharova_lmckone')
        try:
            # NOTE(review): the short name is used here while the insert uses
            # the fully qualified one - presumably dml prefixes the
            # authenticated user to the collection name; confirm.
            repo.dropCollection('optimal_score')
            repo.createCollection('optimal_score')
            if results:
                # insert_many rejects an empty document list.
                repo['agoncharova_lmckone.optimal_score'].insert_many(results)
        finally:
            repo.logout()
        print("ran SMT solver and saved data to optimal_score collection")
        return {"start": startTime, "end": datetime.datetime.now()}

    @staticmethod
    def provenance(doc=None, startTime=None, endTime=None):
        '''
        Record the provenance of the optimal_score data set in doc.

        doc: the ProvDocument to extend; a new one is created when omitted.
        startTime, endTime: time bounds recorded for the computation.
        Returns the updated provenance document.
        '''
        if doc is None:
            # Build the default per call so documents are not shared
            # between invocations.
            doc = prov.model.ProvDocument()

        # NOTE(review): the repository connection is not used below; it is
        # kept so the method still authenticates as it did before.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('agoncharova_lmckone', 'agoncharova_lmckone')
        doc.add_namespace('alg', 'http://datamechanics.io/algorithm/')  # The scripts are in <folder>#<filename> format.
        doc.add_namespace('dat', 'http://datamechanics.io/data/')  # The data sets are in <user>#<collection> format.
        doc.add_namespace('ont', 'http://datamechanics.io/ontology#')  # 'Extension', 'DataResource', 'DataSet', 'Retrieval', 'Query', or 'Computation'.
        doc.add_namespace('log', 'http://datamechanics.io/log/')  # The event log.

        this_agent = doc.agent(
            'alg:agoncharova_lmckone#optimal_score',
            {prov.model.PROV_TYPE: prov.model.PROV['SoftwareAgent'], 'ont:Extension': 'py'})
        # Output data set written by execute().
        this_entity = doc.entity(
            'dat:agoncharova_lmckone#optimal_score',
            {prov.model.PROV_LABEL: 'Optimal Score', prov.model.PROV_TYPE: 'ont:DataSet'})
        # Input data set, matching the collection listed in `reads`.
        stability_score_resource = doc.entity(
            'dat:agoncharova_lmckone#stability_score',
            {prov.model.PROV_LABEL: 'Stability Score', prov.model.PROV_TYPE: 'ont:DataSet'})

        get_optimal_score = doc.activity('log:uuid' + str(uuid.uuid4()), startTime, endTime)
        doc.usage(get_optimal_score, stability_score_resource, startTime, None,
                  {prov.model.PROV_TYPE: 'ont:Computation'})
        doc.wasAssociatedWith(get_optimal_score, this_agent)
        doc.wasAttributedTo(this_entity, this_agent)
        doc.wasGeneratedBy(this_entity, get_optimal_score, endTime)
        doc.wasDerivedFrom(this_entity, stability_score_resource,
                           get_optimal_score, get_optimal_score, get_optimal_score)
        repo.logout()
        return doc
# optimal_score.execute()
# optimal_score.provenance()