-
Notifications
You must be signed in to change notification settings - Fork 0
/
TF-IDF-COSINE.py
92 lines (80 loc) · 2.72 KB
/
TF-IDF-COSINE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#THE ISLANDERS
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import csv
# Load the lyrics dataset. The CSV's own header row is skipped and replaced by
# the explicit column names given here. (The previous stray
# open('Lyrics.csv') handle was never used or closed, so it has been removed;
# read_csv opens and closes the file itself.)
df = pd.read_csv('Lyrics.csv', names=['song', 'artist', 'lyrics'], sep=',', skiprows=1, encoding='latin-1')
porter = PorterStemmer()

# Build the stop-word collection once. It used to be re-evaluated for every
# single token inside the filtering comprehension.
english_stopwords = set(stopwords.words('english'))

docs = []       # preprocessed lyrics, fed to the TF-IDF vectorizer
documents = []  # raw lyrics, index-aligned with docs
for raw_lyrics in df['lyrics']:
    documents.append(raw_lyrics)
    # pandas reads an empty cell as NaN (a float), which has no .strip();
    # treat it as empty text so the row keeps its position in every list.
    line = raw_lyrics.strip() if isinstance(raw_lyrics, str) else ''
    tokens = word_tokenize(line)
    # Keep purely alphabetic tokens, lower-cased.
    words = [word.lower() for word in tokens if word.isalpha()]
    tokens_without_sw = [word for word in words if word not in english_stopwords]
    stemmed = [porter.stem(word) for word in tokens_without_sw]
    # Index the *stemmed* tokens: the search query is stemmed before lookup,
    # so unstemmed documents would never match stemmed query terms.
    docs.append(' '.join(stemmed))

# Song titles and artist names, index-aligned with docs/documents.
songs = list(df['song'])
artists = list(df['artist'])
# TF-IDF model shared by the indexing step here and the query lookup later on
vectorizer = TfidfVectorizer()
# Learn the vocabulary from the preprocessed lyrics, then transpose and
# densify the result so that the vocabulary runs along the rows
X = vectorizer.fit_transform(docs).T.toarray()
# Wrap the matrix in a DataFrame whose row labels are the vocabulary terms
df2 = pd.DataFrame(data=X, index=vectorizer.get_feature_names_out())
def get_similar_lyrics(q, df2, top_n=50):
    """Rank lyrics by cosine similarity to a query and export the best matches.

    The full ranking is printed, then each reported match is printed and
    appended to ``Lyrics_after_tfidf.csv`` (columns ``song``, ``artist``,
    ``lyrics``). Documents with zero similarity are skipped, as are documents
    whose similarity equals that of the previously reported match.

    Relies on the module-level ``vectorizer``, ``songs``, ``artists`` and
    ``documents``; column position ``i`` of ``df2`` must correspond to entry
    ``i`` of those lists.

    Args:
        q: Preprocessed query string.
        df2: TF-IDF matrix as a DataFrame with one row per vocabulary term
            and one column per document.
        top_n: Maximum number of matches to report (defaults to 50).

    Returns:
        None.
    """
    print("query:", q)
    print("The following are lyrics with the highest cosine similarity values: ")
    # Convert the query into a vector over the same vocabulary as df2's rows
    q_vec = vectorizer.transform([q]).toarray().reshape(df2.shape[0],)
    q_norm = np.linalg.norm(q_vec)  # loop-invariant, computed once

    # Calculate the similarity of every document column to the query
    term_doc = df2.to_numpy()
    sim = {}
    for i in range(df2.shape[1]):
        doc_vec = term_doc[:, i]
        # cos(d, q) = d.q / (|d| * |q|). The product of the norms must be
        # parenthesised; "a / b * c" would multiply by |q| instead.
        denom = np.linalg.norm(doc_vec) * q_norm
        # A zero-norm document or query has no direction: report 0 rather
        # than a 0/0 NaN, which would also break the sort below.
        sim[i] = np.dot(doc_vec, q_vec) / denom if denom > 0 else 0.0

    # Sort the values, best match first
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    print(sim_sorted)

    # Print the lyrics and their similarity values, and export them.
    # newline="" is what the csv module requires of files it writes to; the
    # with-block guarantees the file is closed even if a row fails.
    reported = 0
    last_v = 0.0
    with open('Lyrics_after_tfidf.csv', "a", encoding="utf-8", newline="") as result:
        fieldnames = ['song', 'artist', 'lyrics']
        writer = csv.DictWriter(result, fieldnames=fieldnames)
        # The file is opened in append mode: only emit the header when the
        # file is still empty, so repeated calls do not inject header rows
        # into the middle of the data.
        if result.tell() == 0:
            writer.writeheader()
        for k, v in sim_sorted:
            if reported >= top_n:
                break
            # Skip non-matches and repeats of the previous similarity value.
            if v == 0.0 or v == last_v:
                continue
            last_v = v
            print("Similarity Value:", v)
            writer.writerow({'song': songs[k], 'artist': artists[k], 'lyrics': documents[k]})
            print(documents[k])
            print()
            reported += 1
# Search query, normalised the same way as described for the lyrics:
# tokenise, keep alphabetic tokens in lower case, drop English stop words,
# then stem what is left.
q1 = 'love and peace'
english_stop_list = stopwords.words('english')
query_terms = [token.lower() for token in word_tokenize(q1.strip()) if token.isalpha()]
q1 = " ".join(
    porter.stem(term) for term in query_terms if term not in english_stop_list
)
# Run the search against the TF-IDF matrix
get_similar_lyrics(q1, df2)