-
Notifications
You must be signed in to change notification settings - Fork 0
/
art_matcher_algorithm.py
178 lines (134 loc) · 5.22 KB
/
art_matcher_algorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#-----------------------------------------------------------#
# Art Matcher #
# Author: Rocio Ng #
# Purpose: Implements K-means to cluster #
# artwork based on feature vectors #
# and returns images closest to #
# the query image in the feature #
# space #
#-----------------------------------------------------------#
from secret import SQL_password
import pymysql as mdb
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from img_feature_extraction import img_from_url
import scipy.spatial as sp
import pickle
from sklearn.externals import joblib
import time
# for testing:
# link = "http://www.cianellistudios.com/images/abstract-art/abstract-art-mother-earth.jpg"
# for future --- allow user to decide how many works to return
# def img_cluster():
# pass
#----------Convert SQL Image Feature Database to Pandas Dataframe-------------#
def artwork_df():
    """Load the image feature table from the ``tumblr_db`` MySQL database.

    Selects every row of the ``Artwork`` table whose ``Avg_Gray`` value is
    not NULL, drops the ``Id``, ``Tags`` and ``Notes`` columns, and indexes
    the result by ``(Blog_Name, Img_url)``.

    Returns:
        pandas.DataFrame: the remaining columns of ``Artwork``, indexed by
        blog name and image URL.
    """
    print("Now connecting to tumblr_db")
    con = mdb.connect('localhost',
                      'root',
                      SQL_password,
                      'tumblr_db')
    try:
        print("Now extracting the feature space for all the images in the Database")
        sql = "SELECT * FROM Artwork WHERE Avg_Gray is NOT NULL"
        art_df = pd.read_sql(sql, con)
    finally:
        # Close explicitly instead of relying on ``with con:``, whose exit
        # behaviour (commit vs. close) differs between PyMySQL releases, so
        # the connection is released even when the query fails.
        con.close()

    # get rid of unwanted columns
    art_df = art_df.drop(["Id", "Tags", "Notes"], axis=1)
    art_df = art_df.set_index(["Blog_Name", "Img_url"])
    return art_df
#print art_df.head()
#-----Have csv pre-loaded to save processing time on the web-application-------#
# artwork_df = artwork_df()
# artwork_df.to_csv("art_df.csv")
def k_means():
    """Fit a 9-cluster K-means model on the channel features and pickle it.

    Pulls the feature table through ``artwork_df()``, keeps the columns
    from ``Avg_Blue`` through ``Low_Gray`` except ``Avg_Gray``, fits
    ``KMeans(n_clusters=9)`` on them and dumps the fitted estimator to
    ``RGB_kmeans_model.pkl`` with joblib. Nothing is returned.
    """
    # feature space of every art image in the database
    features = artwork_df()

    # Clustering on channels only. Avg_Gray is left out because it is
    # mostly lumped along a single value.
    channels = features.loc[:, "Avg_Blue":"Low_Gray"].drop("Avg_Gray", axis=1)

    print("Now running the model")
    model = KMeans(n_clusters=9)
    model.fit(channels)

    print("Now pickling the results of the model")
    joblib.dump(model, "RGB_kmeans_model.pkl")
def art_match(url):
    """Return the URLs of the database images closest to the image at *url*.

    The query image is turned into a feature vector by ``img_from_url``,
    assigned to a cluster of the pickled K-means model, and compared by
    Euclidean distance against every database image in that cluster.

    Args:
        url: address of the query image.

    Returns:
        list: up to six ``Img_url`` values, nearest match first.
    """
    # load model results and database
    print("Now loading the Pickled K-means Fit")
    model = joblib.load('RGB_kmeans_model.pkl')

    print("Now loading art features dataframe")
    # read_csv replaces DataFrame.from_csv, which pandas deprecated in 0.21
    # and removed in 1.0.
    art_df = pd.read_csv("art_df.csv", index_col=[0, 1])
    art_df = art_df.loc[:, "Avg_Blue":"Low_Gray"].drop("Avg_Gray", axis=1)
    print(art_df.shape)

    # load the query url and extract its feature vector
    print("Loading the query image")
    query_img = img_from_url(url)
    # include only channel features (last four have contour info)
    query_img = query_img[0:16]
    print(len(query_img))
    query_img.pop(12)  # remove avg gray value
    print(query_img)

    # assign cluster number for query image
    print("Now assigning query image to a cluster")
    # predict() expects a 2-D (n_samples, n_features) input, so the single
    # vector is wrapped in a list; a bare 1-D vector is rejected by newer
    # scikit-learn releases.
    cluster = model.predict([query_img])[0]

    print("Now extracting artwork from the same cluster")
    # NOTE(review): labels_ is positional, so art_df.csv must hold the same
    # rows in the same order as the data k_means() was fitted on -- confirm.
    in_cluster = np.where(model.labels_ == cluster)[0]
    art_subset = art_df.iloc[in_cluster]
    print("There are %i images in this cluster" % len(art_subset))

    start_time = time.time()  # for checking time of matching
    print("Now finding matches to the query image")
    # Euclidean distance of each art image in the cluster to the query image;
    # iterating the underlying array avoids a pandas lookup per row.
    distances = [sp.distance.euclidean(row, query_img)
                 for row in art_subset.values]
    print("--- %s seconds ---" % (time.time() - start_time))

    # Positions of the 6 smallest distances. Ranking with argsort avoids
    # writing a column into a slice of art_df and the removed DataFrame.sort.
    closest = np.argsort(distances)[:6]
    matches = art_subset.iloc[closest].reset_index()

    # get the urls they refer to
    return list(matches["Img_url"])
# # # # for testing:
# link = "http://www.cianellistudios.com/images/abstract-art/abstract-art-mother-earth.jpg"
# # # # for testing:
# art_match(link)
# if __name__ == '__main__':
# k_means()
# link = "http://www.cianellistudios.com/images/abstract-art/abstract-art-mother-earth.jpg"
# # # # for testing:
# art_match(link)
def k_means_contour():
    """Unfinished placeholder for clustering that includes contour features.

    For now it only loads the cached feature table from ``art_df.csv`` and
    narrows it to the ``Avg_Blue`` through ``Low_b`` column range; nothing
    is fitted, saved or returned.
    """
    # extract contour
    print("Now loading art contour features")
    features = pd.DataFrame.from_csv("art_df.csv", index_col=[0, 1])
    # label-based slice: keeps every column from Avg_Blue through Low_b
    contour_df = features.loc[:, "Avg_Blue":"Low_b"]