-
Notifications
You must be signed in to change notification settings - Fork 0
/
listing_cluster.py
117 lines (92 loc) · 3.27 KB
/
listing_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import dbscan
from library import database
from sklearn.cluster import DBSCAN
from sklearn.mixture import DPGMM
import sys
'''
Takes the raw rows and drops every row that has a zero in any of the checked columns.
Returns a list of two lists: [data, identification].
'''
def _check_data(data, to_check):
    """Drop rows with a zero in any checked column and split ids from values.

    Args:
        data: iterable of indexable rows. Element 0 of each row is its
            identifier; the remaining elements are its values.
        to_check: indices into the full row (identifier included) of the
            columns that must be non-zero for the row to be kept.

    Returns:
        A two-item list ``[rows, identification]``. ``rows`` holds
        ``entry[1:]`` for every kept row and ``identification`` holds the
        matching ``entry[0]`` values, both in input order.
    """
    final = []
    identification = []
    for entry in data:
        # One zero in a checked column disqualifies the whole row; any()
        # stops looking as soon as the first zero is found.
        if any(entry[x] == 0 for x in to_check):
            continue
        final.append(entry[1:])
        identification.append(entry[0])
    return [final, identification]
'''
Internal helper that clusters the data with DBSCAN.
Takes the full data, the matching identifiers and a database connection,
and returns a list of [identifier, label] pairs.
'''
def _DBSCAN(cluster_data, identification, db):
    """Cluster the rows with DBSCAN and pair each identifier with its label.

    Args:
        cluster_data: feature rows, passed unchanged to ``DBSCAN().fit``.
        identification: identifiers, one per row, in the same order as
            ``cluster_data``.
        db: database connection. Accepted so existing callers keep working,
            but this function does not use it.

    Returns:
        A list of ``[identifier, label]`` pairs in input order.
    """
    # Named ``model`` so the local does not shadow the imported ``dbscan`` module.
    model = DBSCAN().fit(cluster_data)
    labels = model.labels_
    # Label -1 is excluded from the count (scikit-learn uses it for noise points).
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters with DBSCAN: %d' % n_clusters_)
    final = [[ident, label] for ident, label in zip(identification, labels)]
    # Quick sanity output: number of pairs and a small sample of them.
    print(len(final))
    print(final[:3])
    return final
'''
Internal function that runs Dirichlet process (DPGMM) clustering.
'''
def _Dirichlet(cluster_data, identification):
    """Cluster the rows with a DPGMM and pair each identifier with its label.

    Args:
        cluster_data: feature rows, used both to fit the model and to
            predict a component for every row.
        identification: identifiers, one per row, in the same order as
            ``cluster_data``.

    Returns:
        A list of ``[identifier, component]`` pairs in input order, taken
        from the last of the three fits.
    """
    print("In Dirichlet")
    # NOTE(review): the model is refit three times and only the predictions
    # of the last fit are returned; presumably the repeats exist to compare
    # the printed cluster counts between runs -- confirm this is intended.
    for i in range(3):
        # Two spaces on purpose: identical output to the former
        # ``print "i is ", i`` statement.
        print("i is  %s" % i)
        # n_components is set to the number of rows.
        dirichlet = DPGMM(n_components=len(cluster_data)).fit(cluster_data)
        predict = dirichlet.predict(cluster_data)
        n_clusters_ = len(set(predict)) - (1 if -1 in predict else 0)
        print('Estimated number of clusters with Dirichlet: %d' % n_clusters_)
    final = [[ident, label] for ident, label in zip(identification, predict)]
    print("this is what final sort of looked like")
    print(final[:3])
    return final
'''
For this test, the big clustering run is done for Barcelona only.
'''
def main():
    """Cluster the plain and fancy feature tables and store the labels.

    Reads every row of ``features_plain_gobig`` and ``features_fancy`` from
    the "Thesis" database, treats column 0 as the identifier and converts
    the remaining columns to floats, clusters each set with ``_Dirichlet``,
    then clears and refills ``listing_clusters_plain`` and
    ``listing_clusters_fancy`` with the resulting rows.
    """
    # Kept function-local: the original author noted that this import
    # suddenly had to live inside the function.
    from library import database

    data_thesis = database.database("Thesis")
    try:
        query_plain = "SELECT * FROM `features_plain_gobig`;"
        query_fancy = "SELECT * FROM `features_fancy`;"
        data_plain = data_thesis.get_data(query_plain)
        data_fancy = data_thesis.get_data(query_fancy)

        # Split every row into its identifier (column 0) and float features.
        go_plain = [[float(item) for item in entry[1:]] for entry in data_plain]
        go_plain_identification = [entry[0] for entry in data_plain]
        go_fancy = [[float(item) for item in entry[1:]] for entry in data_fancy]
        go_fancy_identification = [entry[0] for entry in data_fancy]

        print("Let's try to isolate clusters by location!")
        plain_final = _Dirichlet(go_plain, go_plain_identification)
        fancy_final = _Dirichlet(go_fancy, go_fancy_identification)

        # Cluster counts noted by the author on the full data:
        #   go big: 11 clusters, go small: 13 clusters.
        # Each result row is [listing_id, cluster].

        # Save data. The two result sets are written in separate loops
        # because they come from different tables and need not have the same
        # number of rows; one shared index would raise IndexError or silently
        # skip rows.
        data_thesis.clear_table("listing_clusters_fancy")
        data_thesis.clear_table("listing_clusters_plain")
        for row in plain_final:
            data_thesis.add_entry_to_table("listing_clusters_plain", row)
        for row in fancy_final:
            data_thesis.add_entry_to_table("listing_clusters_fancy", row)
    finally:
        # Release the connection even when a query or the clustering fails.
        data_thesis.destroy_connection()


if __name__ == '__main__':
    main()