-
Notifications
You must be signed in to change notification settings - Fork 1
/
serve.py
632 lines (520 loc) · 21.7 KB
/
serve.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
"""
Flask server backend
ideas:
- allow delete of tags
- unify all different pages into single search filter sort interface
- special single-image search just for paper similarity
"""
import os
import re
import time
from random import shuffle
import math
from functools import cache
import numpy as np
from sklearn import svm
from flask import Flask, request, redirect, url_for
from flask import render_template
from flask import g # global session-level object
from flask import session
from aslite.db import get_papers_db, get_metas_db, get_tags_db, get_last_active_db, get_email_db, get_tweets_db
from aslite.db import load_features
from eprint_daemon import parse_date_string
from datetime import datetime, timezone, timedelta
# -----------------------------------------------------------------------------
# inits and globals
RET_NUM = 25 # number of papers to return per page
max_tweet_records = 15 # cap on tweets shown for any single paper
app = Flask(__name__)
# set the secret key so we can cryptographically sign cookies and maintain sessions
if os.path.isfile('secret_key.txt'):
    # example of generating a good key on your system is:
    # import secrets; secrets.token_urlsafe(16)
    # use a context manager so the file handle is closed promptly
    # (the original bare open() leaked the handle)
    with open('secret_key.txt') as f:
        sk = f.read().strip()
else:
    print("WARNING: no secret key found, using default devkey")
    sk = 'devkey'
app.secret_key = sk
# -----------------------------------------------------------------------------
# globals that manage the (lazy) loading of various state for a request
def get_tags():
    """Return the logged-in user's tag dict, cached on `g` for the request."""
    if g.user is None:
        return {}
    if not hasattr(g, '_tags'):
        with get_tags_db() as tags_db:
            if g.user in tags_db:
                g._tags = tags_db[g.user]
            else:
                g._tags = {}
    return g._tags
def get_papers():
    """Open the papers db lazily, memoized on `g` for this request."""
    pdb = getattr(g, '_pdb', None)
    if pdb is None:
        pdb = get_papers_db()
        g._pdb = pdb
    return pdb
def get_metas():
    """Open the metas db lazily, memoized on `g` for this request."""
    mdb = getattr(g, '_mdb', None)
    if mdb is None:
        mdb = get_metas_db()
        g._mdb = mdb
    return mdb
def get_tweets():
    """Open the tweets db lazily, memoized on `g` for this request."""
    tdb = getattr(g, '_tweets', None)
    if tdb is None:
        tdb = get_tweets_db()
        g._tweets = tdb
    return tdb
@app.before_request
def before_request():
    """Stash the session user on `g` and record their last-active time."""
    user = session.get('user', None)
    g.user = user
    # record activity on this user so we can reserve periodic
    # recommendations heavy compute only for active users
    if user:
        with get_last_active_db(flag='c') as last_active_db:
            last_active_db[user] = int(time.time())
@app.teardown_request
def close_connection(error=None):
    """Close any database connections opened lazily during this request."""
    if hasattr(g, '_pdb'):
        g._pdb.close()
    if hasattr(g, '_mdb'):
        g._mdb.close()
    # the tweets db opened by get_tweets() was previously never closed;
    # guard on close() existing in case it is a plain mapping — TODO confirm
    # against aslite.db's get_tweets_db return type
    if hasattr(g, '_tweets') and hasattr(g._tweets, 'close'):
        g._tweets.close()
# -----------------------------------------------------------------------------
# ranking utilities for completing the search/rank/filter requests
def render_pid(pid):
    # render a single paper with just the information we need for the UI
    # NOTE(review): fields of d appear to be lists of strings (d['title'][0],
    # d['date'][1], d['identifier'][0]) — confirm against aslite.db's schema
    pdb = get_papers()
    tags = get_tags()
    d = pdb[pid]
    # second element of 'date' holds the date string to display
    nice_date = "{:%d %b %Y}".format(parse_date_string(d['date'][1]))
    # strip the base URL from the identifier to get the short id, e.g. '2023/123'
    nice_pid = d['identifier'][0].split('eprint.iacr.org/')[1]
    thumb_path = 'static/thumb/' + nice_pid + '.jpg'
    # empty string signals "no thumbnail" to the template
    thumb_url = thumb_path if os.path.isfile(thumb_path) else ''
    return dict(
        weight = 0.0, # placeholder; callers overwrite with the rank score
        id = nice_pid,
        title = d['title'][0],
        time = nice_date,
        authors = ', '.join(d['creator']),
        tags = ', '.join(d['subject']),
        # user-defined tags that contain this paper
        utags = [t for t, pids in tags.items() if nice_pid in pids],
        summary = d['description'],
        thumb_url = thumb_url,
    )
def random_rank():
    """Return all paper ids in a random order, each with a zero score."""
    mdb = get_metas()
    pids = list(mdb.keys())
    shuffle(pids)
    return pids, [0] * len(pids)
def time_rank():
    """Return paper ids newest-first, scored by their age in days."""
    mdb = get_metas()
    by_recency = sorted(mdb.items(), key=lambda kv: kv[1]['date'], reverse=True)
    now = datetime.now(timezone.utc)
    seconds_per_day = 60 * 60 * 24
    pids, scores = [], []
    for k, v in by_recency:
        pids.append(k)
        # score is the time delta to now, expressed in days
        scores.append((now - parse_date_string(v['date'])).total_seconds() / seconds_per_day)
    return pids, scores
def FIX_PID(pid):
    """Canonicalize an eprint identifier into a full https URL.

    Accepts a short id like '2023/123' (prefixed with the eprint base URL),
    an already-canonical 'https://eprint.iacr.org/...' URL (returned
    unchanged), or an empty string (returns ''). Previously a full URL was
    mapped to '', which broke lookups for callers passing canonical pids;
    it also was a lambda assigned to a name (PEP 8 prefers def).
    """
    if not pid:
        return ''
    if pid.startswith('https://eprint.iacr.org'):
        return pid
    return 'https://eprint.iacr.org/' + pid
def svm_rank(tags: str = '', pid: str = '', C: float = 0.01):
    """Rank all papers with an exemplar SVM.

    The positive set is either one paper (`pid`, nearest-neighbor style) or
    the papers under the user's tag(s); `tags` may be one tag, a
    comma-separated list, or 'all'. Returns (pids, scores, words), where
    words are the most positively/negatively weighted vocabulary terms.
    """
    pid = FIX_PID(pid)
    if not (tags or pid):
        return [], [], []
    # load all of the features
    features = load_features()
    x, pids = features['x'], features['pids']
    n = x.shape[0]
    ptoi, itop = {}, {}
    for i, p in enumerate(pids):
        ptoi[p] = i
        itop[i] = p
    # construct the positive set
    y = np.zeros(n, dtype=np.float32)
    if pid:
        # guard the lookup: a pid with no feature vector yet should fall
        # through to the "no positives" early return, not raise KeyError
        if pid in ptoi:
            y[ptoi[pid]] = 1.0
    elif tags:
        tags_db = get_tags()
        tags_filter_to = tags_db.keys() if tags == 'all' else set(tags.split(','))
        # use fresh loop names: the original reused `pids`/`pid`, shadowing
        # the feature pid list and the function parameter
        for tag, tag_pids in tags_db.items():
            if tag in tags_filter_to:
                for tag_pid in tag_pids:
                    fixed = FIX_PID(tag_pid)
                    if fixed in ptoi:
                        y[ptoi[fixed]] = 1.0
    if y.sum() == 0:
        return [], [], [] # there are no positives?
    # classify
    clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=C)
    clf.fit(x, y)
    s = clf.decision_function(x)
    sortix = np.argsort(-s)
    pids = [itop[ix] for ix in sortix]
    scores = [100*float(s[ix]) for ix in sortix]
    # get the words that score most positively and most negatively for the svm
    ivocab = {v:k for k,v in features['vocab'].items()} # index to word mapping
    weights = clf.coef_[0] # (n_features,) weights of the trained svm
    sortix = np.argsort(-weights)
    words = []
    for ix in list(sortix[:40]) + list(sortix[-20:]):
        words.append({
            'word': ivocab[ix],
            'weight': weights[ix],
        })
    return pids, scores, words
@cache
def tprepro(tweet_text):
    """Normalize a tweet into a set of lowercase words, excluding hashtags.

    Hashtag tokens must be filtered BEFORE stripping punctuation: the
    original stripped '#' first, so the startswith('#') test could never
    match and hashtags leaked into the word set.
    """
    t = tweet_text.lower()
    tokens = [w for w in t.split() if not w.startswith('#')]
    cleaned = (re.sub(r'[^\w\s]', '', w) for w in tokens) # remove punctuation
    return {w for w in cleaned if w}
def score_tweet(tweet):
    """Compute a tweet's vote contribution from author reach and engagement."""
    # people with more followers get more vote (log scale, capped), as the
    # tweet is seen by more people and contributes to more hype
    reach = min(math.log10(tweet['user_followers_count'] + 1), 4.0) / 2.0
    # uprank tweets with more likes, retweets, replies, and quotes
    engagement = math.log10(tweet['like_count'] + tweet['retweet_count'] + 1)
    engagement += math.log10(tweet['reply_count'] + tweet['quote_count'] + 1)
    return reach + engagement
def weight_tweet(tweet):
    """Heuristic quality weight for a tweet (higher = more substantive)."""
    papers = get_papers()
    weight = 10.0
    # downweight official/bot-ish accounts; compare lowercase to lowercase —
    # the original tested "IACR" against a lowercased string, which could
    # never match
    if "iacr" in tweet['user_screen_name'].lower():
        weight -= 1
    # some tweets are really boring, like an rt, non-English, or very short
    if (tweet['text'].lower().startswith('rt') or
            tweet['lang'] != 'en' or
            len(tweet['text']) < 40):
        weight -= 1
    # good tweets make a comment, not just a boring RT, or exactly the post
    # title. Count words in the tweet beyond the paper titles themselves.
    tweet_words = len(tprepro(tweet['text']))
    title_words = 0
    for pid in tweet['pids']:
        if pid not in papers:
            continue
        title_words += len(tprepro(papers[pid]['title'][0]))
    # how much does the tweet have other than just the actual title of the article?
    comment_words = tweet_words - title_words
    if comment_words < 3:
        weight -= 1
    return weight
@cache
def tweets_rank(days=7):
    """Rank papers by weighted tweet votes over the trailing `days` days.

    Returns (pids, scores, tweets): paper ids sorted by total vote
    (descending), the vote totals, and per-paper tweet records.
    NOTE(review): @cache memoizes per `days` value for the process lifetime,
    so results go stale as new tweets arrive — confirm this is intended.
    """
    try:
        days = int(days)
    except (ValueError, TypeError):
        # narrowed from a bare except; keep the best-effort default
        days = 7
    tweets = get_tweets()
    papers = get_papers()
    tnow = time.time()
    t0 = tnow - int(days)*24*60*60
    tweets_filter = [t for p,t in tweets.items() if t['created_at_time'] > t0]
    raw_votes, votes, records_dict = {}, {}, {}
    for tweet in tweets_filter:
        # filter out bots
        if "arxiv" in tweet['user_screen_name'].lower():
            continue
        for pid in set(tweet['pids']):
            if pid not in papers:
                continue
            if pid not in records_dict:
                records_dict[pid] = {'pid':pid, 'tweets':[], 'vote': 0.0, 'raw_vote': 0} # create a new entry for this pid
            float_vote = score_tweet(tweet)
            weight = float_vote + weight_tweet(tweet)
            # add up the votes for papers
            records_dict[pid]['tweets'].append({'screen_name':tweet['user_screen_name'], 'text':tweet['text'], 'weight':weight, 'id':tweet['id'] })
            votes[pid] = votes.get(pid, 0.0) + float_vote
            raw_votes[pid] = raw_votes.get(pid, 0) + 1
    # record the total amount of vote/raw_vote for each pid
    for pid in votes:
        records_dict[pid]['vote'] = votes[pid] # total vote across relevant tweets
        records_dict[pid]['raw_vote'] = raw_votes[pid]
    pids = sorted(records_dict, key=lambda x: records_dict[x]['vote'], reverse=True)
    scores = [records_dict[pid]['vote'] for pid in pids]
    tweets = [records_dict[pid]['tweets'] for pid in pids]
    return pids, scores, tweets
def search_rank(q: str = ''):
    """Score every paper against a free-text query; returns (pids, scores)."""
    if not q:
        return [], [] # no query? no results
    terms = q.lower().strip().split() # split query by spaces and lowercase

    def capped_count(s):
        # each term contributes at most 3 occurrences
        low = s.lower()
        return sum(min(3, low.count(t)) for t in terms)

    def presence(s):
        # 1 per term that appears at all
        low = s.lower()
        return sum(1 for t in terms if t in low)

    pdb = get_papers()
    scored = []
    for pid, p in pdb.items():
        author_str = ' '.join(p['creator'])
        title_str = p['title'][0] if len(p['title']) > 0 else ''
        desc_str = p['description'][0] if len(p['description']) > 0 else ''
        # titles count most, then authors, then raw description hits
        total = 10.0 * presence(author_str) + 20.0 * presence(title_str) + 1.0 * capped_count(desc_str)
        if total > 0:
            scored.append((total, pid))
    scored.sort(reverse=True)
    return [pid for _, pid in scored], [s for s, _ in scored]
# -----------------------------------------------------------------------------
# primary application endpoints
def default_context():
    """Context shared by every page; currently just the signed-in user."""
    user = g.user
    return {'user': user if user is not None else ''}
@app.route('/', methods=['GET'])
def main():
    """Index page: rank, filter, and paginate papers per query-string options."""
    # default settings
    default_rank = 'time'
    default_tags = ''
    default_time_filter = ''
    default_skip_have = 'no'
    # override variables with any provided options via the interface
    opt_rank = request.args.get('rank', default_rank) # rank type. search|tags|pid|time|tweets|random
    opt_q = request.args.get('q', '') # search request in the text box
    opt_tags = request.args.get('tags', default_tags) # tags to rank by if opt_rank == 'tag'
    opt_pid = request.args.get('pid', '') # pid to find nearest neighbors to
    opt_time_filter = request.args.get('time_filter', default_time_filter) # number of days to filter by
    opt_skip_have = request.args.get('skip_have', default_skip_have) # hide papers we already have?
    opt_svm_c = request.args.get('svm_c', '') # svm C parameter
    opt_tweet_filter = request.args.get('tweet_filter', '') # days of tweets to filter
    opt_page_number = request.args.get('page_number', '1') # page number for pagination
    # if a query is given, override rank to be of type "search"
    # this allows the user to simply hit ENTER in the search field and have the correct thing happen
    if opt_q:
        opt_rank = 'search'
    # try to parse opt_svm_c into something sensible (a float)
    try:
        C = float(opt_svm_c)
    except ValueError:
        C = 0.01 # sensible default, i think
    # rank papers: by tags, by time, by random
    words = [] # only populated in the case of svm rank
    tweets = [] # only populated in the case of tweet rank
    if opt_rank == 'search':
        pids, scores = search_rank(q=opt_q)
    elif opt_rank == 'tags':
        pids, scores, words = svm_rank(tags=opt_tags, C=C)
    elif opt_rank == 'pid':
        pids, scores, words = svm_rank(pid=opt_pid, C=C)
    elif opt_rank == 'time':
        pids, scores = time_rank()
    elif opt_rank == 'tweets':
        pids, scores, tweets = tweets_rank(days=opt_tweet_filter)
    elif opt_rank == 'random':
        pids, scores = random_rank()
    else:
        raise ValueError("opt_rank %s is not a thing" % (opt_rank, ))
    # filter by time
    if opt_time_filter:
        mdb = get_metas()
        kv = {k:v for k,v in mdb.items()} # read all of metas to memory at once, for efficiency
        tnow = datetime.now(timezone.utc)
        deltat = timedelta(days=int(opt_time_filter)) # allowed time delta in days
        keep = [i for i,pid in enumerate(pids) if (tnow - parse_date_string(kv[pid]['date'])) < deltat]
        pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
    # optionally hide papers we already have
    if opt_skip_have == 'yes':
        tags = get_tags()
        have = set().union(*tags.values())
        keep = [i for i,pid in enumerate(pids) if pid not in have]
        pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
    # crop the number of results to RET_NUM, and paginate
    try:
        page_number = max(1, int(opt_page_number))
    except ValueError:
        page_number = 1
    start_index = (page_number - 1) * RET_NUM # desired starting index
    end_index = min(start_index + RET_NUM, len(pids)) # desired ending index
    pids = pids[start_index:end_index]
    scores = scores[start_index:end_index]
    # render all papers to just the information we need for the UI
    papers = [render_pid(pid) for pid in pids]
    for i, p in enumerate(papers):
        p['weight'] = float(scores[i])
    # build the current tags for the user, and append the special 'all' tag
    tags = get_tags()
    rtags = [{'name':t, 'n':len(pids)} for t, pids in tags.items()]
    if rtags:
        rtags.append({'name': 'all'})
    # build the page context information and render
    context = default_context()
    context['papers'] = papers
    context['tags'] = rtags
    context['words'] = words
    context['tweets'] = tweets
    context['words_desc'] = "Here are the top 40 most positive and bottom 20 most negative weights of the SVM. If they don't look great then try tuning the regularization strength hyperparameter of the SVM, svm_c, above. Lower C is higher regularization."
    # bug fix: this previously overwrote words_desc; the tweets blurb belongs
    # under its own key, mirroring the inspect() page
    context['tweets_desc'] = "Here are the top 15 most influential tweets about this paper."
    context['gvars'] = {}
    context['gvars']['rank'] = opt_rank
    context['gvars']['tags'] = opt_tags
    context['gvars']['pid'] = opt_pid
    context['gvars']['time_filter'] = opt_time_filter
    context['gvars']['tweet_filter'] = opt_tweet_filter
    context['gvars']['skip_have'] = opt_skip_have
    context['gvars']['search_query'] = opt_q
    context['gvars']['svm_c'] = str(C)
    context['gvars']['page_number'] = str(page_number)
    return render_template('index.html', **context)
@app.route('/inspect', methods=['GET'])
def inspect():
    # show the tfidf token breakdown and related tweets for a single paper
    # fetch the paper of interest based on the pid
    pid = request.args.get('pid', '')
    pid = FIX_PID(pid)
    pdb = get_papers()
    if pid not in pdb:
        return "error, malformed pid" # todo: better error handling
    # load the tfidf vectors, the vocab, and the idf table
    features = load_features()
    x = features['x']
    idf = features['idf']
    ivocab = {v:k for k,v in features['vocab'].items()} # index -> word
    # row of this paper in the feature matrix
    pix = features['pids'].index(pid)
    # indices of the tokens present (nonzero tfidf) in this paper
    # NOTE(review): x appears to be a scipy sparse matrix (.todense()) — confirm
    wixs = np.flatnonzero(np.asarray(x[pix].todense()))
    words = []
    for ix in wixs:
        words.append({
            'word': ivocab[ix],
            'weight': float(x[pix, ix]), # tfidf weight of the token in this paper
            'idf': float(idf[ix]),
        })
    words.sort(key=lambda w: w['weight'], reverse=True)
    # get the tweets for this paper
    tdb = get_tweets()
    tweets = [t for _, t in tdb.items() if pid in t['pids']]
    for i, t in enumerate(tweets):
        # annotates the tweet records in place with score/weight for the UI
        tweets[i]['votes'] = score_tweet(t)
        tweets[i]['weight'] = weight_tweet(t)
    # crop the tweets to only some number of highest weight ones (for efficiency)
    tweets.sort(reverse=True, key=lambda x: x['weight'])
    if len(tweets) > max_tweet_records:
        tweets = tweets[:max_tweet_records]
    # package everything up and render
    paper = render_pid(pid)
    context = default_context()
    context['paper'] = paper
    context['words'] = words
    context['tweets'] = tweets
    context['words_desc'] = "The following are the tokens and their (tfidf) weight in the paper vector. This is the actual summary that feeds into the SVM to power recommendations, so hopefully it is good and representative!"
    context['tweets_desc'] = "The following are the most influential tweets and their scores."
    return render_template('inspect.html', **context)
@app.route('/profile')
def profile():
    """Profile page, showing the user's registered email (if any)."""
    context = default_context()
    with get_email_db() as edb:
        context['email'] = edb.get(g.user, '')
    return render_template('profile.html', **context)
@app.route('/stats')
def stats():
    """Stats page: total paper count plus counts within recent time windows."""
    context = default_context()
    mdb = get_metas()
    kv = {k:v for k,v in mdb.items()} # read all of metas to memory at once, for efficiency
    times = [v['date'] for v in kv.values()]
    context['num_papers'] = len(kv)
    if len(kv) > 0:
        context['earliest_paper'] = parse_date_string(min(times))
        context['latest_paper'] = parse_date_string(max(times))
    else:
        context['earliest_paper'] = 'N/A'
        context['latest_paper'] = 'N/A'
    # count number of papers from various time deltas to now; parse every
    # date string once instead of once per threshold (was re-parsed 7x)
    tnow = datetime.now(timezone.utc)
    parsed = [parse_date_string(t) for t in times]
    for thr in [1, 6, 12, 24, 48, 72, 96]:
        deltat = timedelta(hours=thr)
        context['thr_%d' % thr] = len([t for t in parsed if (tnow - t) < deltat])
    return render_template('stats.html', **context)
@app.route('/about')
def about():
    """Static about page."""
    return render_template('about.html', **default_context())
# -----------------------------------------------------------------------------
# tag related endpoints: add, delete tags for any paper
@app.route('/add/<pid1>/<pid2>/<tag>')
def add(pid1=None, pid2=None, tag=None):
    """Add paper pid1/pid2 to `tag` in the signed-in user's library."""
    pid = pid1 + '/' + pid2
    if g.user is None:
        return "error, not logged in"
    if tag == 'all':
        return "error, cannot add the protected tag 'all'"
    elif tag == 'null':
        return "error, cannot add the protected tag 'null'"
    with get_tags_db(flag='c') as tags_db:
        # first-time users start with an empty library
        if g.user not in tags_db:
            tags_db[g.user] = {}
        # fetch the user library object
        library = tags_db[g.user]
        # attach the paper to the tag, creating the tag on demand
        library.setdefault(tag, set()).add(pid)
        # write back to database
        tags_db[g.user] = library
        print("added paper %s to tag %s for user %s" % (pid, tag, g.user))
        return "ok: " + str(library) # return back the user library for debugging atm
@app.route('/sub/<pid1>/<pid2>/<tag>')
def sub(pid1=None, pid2=None, tag=None):
    """Remove paper pid1/pid2 from `tag` in the signed-in user's library."""
    pid = pid1 + '/' + pid2
    if g.user is None:
        return "error, not logged in"
    with get_tags_db(flag='c') as tags_db:
        # if the user doesn't have any tags, there is nothing to do
        if g.user not in tags_db:
            # escape the backslash: the original '\_' was an invalid escape
            # sequence (DeprecationWarning; error in future Python); the
            # runtime string is unchanged
            return "user has no library of tags ¯\\_(ツ)_/¯"
        # fetch the user library object
        d = tags_db[g.user]
        # guard clauses instead of the original nested else-pyramid
        if tag not in d:
            return "user doesn't have the tag %s" % (tag, )
        if pid not in d[tag]:
            return "user doesn't have paper %s in tag %s" % (pid, tag)
        # remove this pid from the tag
        d[tag].remove(pid)
        # if this was the last paper in this tag, also delete the tag
        if len(d[tag]) == 0:
            del d[tag]
        # write back the resulting dict to database
        tags_db[g.user] = d
        return "ok removed pid %s from tag %s" % (pid, tag)
@app.route('/del/<tag>')
def delete_tag(tag=None):
    """Delete an entire tag (and its paper set) from the user's library."""
    if g.user is None:
        return "error, not logged in"
    with get_tags_db(flag='c') as tags_db:
        if g.user not in tags_db:
            return "user does not have a library"
        library = tags_db[g.user]
        if tag not in library:
            return "user does not have this tag"
        # drop the tag and persist the updated library
        library.pop(tag)
        tags_db[g.user] = library
        print("deleted tag %s for user %s" % (tag, g.user))
        return "ok: " + str(library) # return back the user library for debugging atm
# -----------------------------------------------------------------------------
# endpoints to log in and out
@app.route('/login', methods=['POST'])
def login():
    """Log the user in (simple username-only auth) and go to the profile page."""
    # the user is logged out but wants to log in, ok
    if g.user is None and request.form['username']:
        username = request.form['username']
        if len(username) > 0: # one more paranoid check
            session['user'] = username
    # always return a response: previously, an already-logged-in user or an
    # empty username fell through and returned None, a 500 error in Flask
    return redirect(url_for('profile'))
@app.route('/logout')
def logout():
    """Clear the session user and bounce back to the profile page."""
    if 'user' in session:
        session.pop('user')
    return redirect(url_for('profile'))
# -----------------------------------------------------------------------------
# user settings and configurations
@app.route('/register_email', methods=['POST'])
def register_email():
    """Store (or clear, when empty) the logged-in user's email address."""
    email = request.form['email']
    if g.user:
        # do some basic input validation
        proper_email = re.match(r'^[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$', email, re.IGNORECASE)
        if email == '' or proper_email: # allow empty email, meaning no email
            # everything checks out, write to the database
            with get_email_db(flag='c') as edb:
                edb[g.user] = email
    # always return a response: previously an invalid email or a logged-out
    # user fell through and returned None, a 500 error in Flask
    return redirect(url_for('profile'))
if __name__ == '__main__':
    # local development entry point; debug=True enables the reloader and
    # in-browser debugger, so presumably production uses a real WSGI server
    # instead — TODO confirm deployment setup
    app.run(debug=True)