
Backend performance and stability improvements #515

Merged (14 commits, Jan 26, 2021)
2 changes: 1 addition & 1 deletion server/preprocessing/other-scripts/base.R
@@ -38,7 +38,7 @@ blog <- getLogger('api.base')

 get_papers <- function(query, params, limit=100,
                        filter=NULL,
-                       retry_opts=rbace::bs_retry_options()) {
+                       retry_opts=rbace::bs_retry_options(3,60,3,4)) {

   blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Search:", query))
   start.time <- Sys.time()
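Note on the change above: the default rbace::bs_retry_options() is replaced with explicit positional values, so BASE requests now retry on a fixed schedule rather than the library defaults. As a hedged illustration of the behaviour such options aim for, here is a generic bounded-retry loop in base R; with_retries and its parameter names are hypothetical, not rbace API:

    # Sketch only: generic bounded retry with a fixed pause between attempts.
    # `with_retries`, `times`, and `pause` are hypothetical names, not rbace API.
    with_retries <- function(fn, times = 4, pause = 3) {
      last_error <- NULL
      for (attempt in seq_len(times)) {
        result <- tryCatch(fn(), error = function(e) e)
        if (!inherits(result, "error")) return(result)   # success: stop retrying
        last_error <- result
        if (attempt < times) Sys.sleep(pause)             # wait before next attempt
      }
      stop(last_error)                                    # all attempts failed
    }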
27 changes: 11 additions & 16 deletions server/preprocessing/other-scripts/features.R
@@ -1,17 +1,9 @@
 vflog <- getLogger('vis.features')

-create_corpus <- function(metadata, text, lang=NULL) {
+create_corpus <- function(metadata, text, languages=c("en")) {
   valid <- getStemLanguages()
-  # if lang not given use lang detection
-  if (is.null(lang)) {
-    text["language"] <- unlist(lapply(metadata$lang_detected,
-                                      function(x) { if (x %in% valid) x else "english"
-                                      }
-                                      ))
-  } else {
-    text["language"] <- if (lang %in% valid) lang else NA
-  }
-  mapping <- list(content = "content", id = "id", language = "language")
+  text["languages"] <- languages
+  mapping <- list(content = "content", id = "id", languages = "languages")
   myReader <- readTabular(mapping = mapping)

   corpus <- Corpus(DataframeSource(text),
@@ -54,14 +46,17 @@ concatenate_features <- function(...) {
   return(cbind(...))
 }

-remove_stop_words <- function(x, language = "english") UseMethod("remove_stop_words", x)
-remove_stop_words.character <- function(x, language = "english") {
+remove_stop_words <- function(x, languages) UseMethod("remove_stop_words", x)
+remove_stop_words.character <- function(x, languages) {
   y <- unlist(strsplit(x, " "))
-  stops <- get_stopwords(language, TESTING)
+  stops = list()
+  for (lang in languages) {
+    stops <- c(stops, get_stopwords(lang, TESTING))
+  }
   stopword <- unlist(lapply(y, function(z) z %in% stops))
   doc <- y[which(!stopword)]
   doc <- paste(doc, collapse = " ")
 }
-remove_stop_words.PlainTextDocument <- function(x, language = meta(x, "language")) {
-  content_transformer(remove_stop_words.character)(x, language)
+remove_stop_words.PlainTextDocument <- function(x, languages = meta(x, "languages")) {
+  content_transformer(remove_stop_words.character)(x, languages)
 }
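The rewritten remove_stop_words() pools the stopword lists of every requested language before filtering, instead of consulting a single language. A minimal self-contained sketch of the pooling idea, using tm's built-in stopwords() as a stand-in for this repo's get_stopwords() (which additionally takes a TESTING flag):

    library(tm)

    # Combine stopword lists across languages (sketch; tm::stopwords stands in
    # for get_stopwords).
    pool_stopwords <- function(languages = c("english", "german")) {
      unique(unlist(lapply(languages, stopwords)))
    }

    filter_words <- function(text, languages) {
      words <- unlist(strsplit(text, " "))
      paste(words[!(words %in% pool_stopwords(languages))], collapse = " ")
    }

    # Drops both English and German stopwords in one pass:
    filter_words("die quick brown fox and der lazy dog", c("english", "german"))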
4 changes: 2 additions & 2 deletions server/preprocessing/other-scripts/preprocess.R
@@ -94,7 +94,7 @@ replace_keywords_if_empty <- function(metadata, stops, service) {
     metadata$subject[is.na(metadata$subject)] <- ""
   } else {
     candidates = mapply(paste, metadata$title)
-    candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
+    candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
     candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
     candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
     candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
@@ -106,7 +106,7 @@ replace_keywords_if_empty <- function(metadata, stops, service) {
     nn_tfidf = TermDocumentMatrix(nn_corpus, control = list(tokenize = SplitTokenizer, weighting = function(x) weightSMART(x, spec="ntn")))
     tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
     tfidf_top_names = lapply(tfidf_top, names)
-    replacement_keywords <- lapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
+    replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
     replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
     replacement_keywords = gsub("_", " ", replacement_keywords)

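The keyword-replacement loops above (and the same swap in pubmed.R and summarize.R below) move from lapply to mclapply, which forks worker processes via R's parallel package so per-document work runs concurrently. A minimal illustration of the swap; the mc.cores value is illustrative, and on Windows mclapply silently falls back to serial execution:

    library(parallel)

    slow_task <- function(i) { Sys.sleep(0.1); i^2 }   # stand-in for per-document work

    serial <- lapply(1:8, slow_task)                   # runs on one core
    forked <- mclapply(1:8, slow_task, mc.cores = 4)   # forks 4 workers
    identical(serial, forked)                          # TRUE: same result, less wall time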
2 changes: 1 addition & 1 deletion server/preprocessing/other-scripts/pubmed.R
@@ -81,7 +81,7 @@ get_papers <- function(query, params = NULL, limit = 100, retry_opts = rentrez::
     stop(paste("No results retrieved."))
   }

-  out <- lapply(xml, function(z) {
+  out <- mclapply(xml, function(z) {
     flds <- switch(
       xml2::xml_name(z),
       PubmedArticle = fields,
3 changes: 1 addition & 2 deletions server/preprocessing/other-scripts/summarize.R
@@ -85,7 +85,7 @@ create_cluster_labels <- function(clusters, metadata,


 fix_cluster_labels <- function(clusterlabels, type_counts){
-  unlist(lapply(clusterlabels, function(x) {
+  unlist(mclapply(clusterlabels, function(x) {
     fix_keyword_casing(x, type_counts)
   }))
 }
@@ -113,7 +113,6 @@ get_cluster_corpus <- function(clusters, metadata, service, stops, taxonomy_sepa
     matches = which(metadata$id %in% group)
     titles = metadata$title[matches]
     subjects = metadata$subject[matches]
-    langs = metadata$lang_detected[matches]
     titles = lapply(titles, function(x) {gsub("[^[:alnum:]-]", " ", x)})
     titles = lapply(titles, gsub, pattern="\\s+", replacement=" ")
     title_ngrams <- get_title_ngrams(titles, stops)
8 changes: 6 additions & 2 deletions server/preprocessing/other-scripts/utils.R
@@ -94,15 +94,19 @@ detect_error <- function(failed, service) {
   if (!is.null(failed$query_reason)) {
     # map response to individual error codes/messages
     # then return them as json list
+    if (service == 'base' && startsWith(failed$query_reason, "Error in curl::curl_fetch_memory(x$url$url, handle = x$url$handle): Timeout was reached")){
+      reason <- c(reason, 'API error: timeout')
+    }
     if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure: 502, bad gateway")){
       reason <- c(reason, 'API error: requested metadata size')
     }
     if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure: 500")){
       reason <- c(reason, 'API error: PubMed not reachable')
     }
-    if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure")){
+    if (length(reason) == 0 && service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure")){
       reason <- c(reason, 'unexpected PubMed API error')
-    } else {
+    }
+    if (length(reason) == 0) {
       result <- regmatches(failed$query, regexec(phrasepattern, failed$query))
       # if not one of the known data source API errors:
       # apply query error detection heuristics
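The detect_error() changes do two things: they add a mapping for BASE curl timeouts, and they guard both the generic PubMed branch and the query heuristics with length(reason) == 0, so a specific error label always wins and the heuristics only run when no known API error matched. Reduced to its core, the first-match-wins pattern looks like this (msg is a made-up example input):

    reason <- c()
    msg <- "HTTP failure: 502, bad gateway"   # made-up example input
    if (startsWith(msg, "HTTP failure: 502")) reason <- c(reason, "API error: requested metadata size")
    if (length(reason) == 0 && startsWith(msg, "HTTP failure")) reason <- c(reason, "unexpected PubMed API error")
    # only the specific label survives; the generic branch never fires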
12 changes: 2 additions & 10 deletions server/preprocessing/other-scripts/vis_layout.R
@@ -13,7 +13,7 @@ library(stringi)
 library(stringdist)
 library(plyr)
 library(onehot)
-registerDoParallel(3)
+registerDoParallel(6)


 vlog <- getLogger('vis')
@@ -54,20 +54,12 @@ vis_layout <- function(text, metadata, service,
   text <- filtered$text

   if(vis_type=='overview'){
-    metadata["lang_detected"] <- detect_language(text$content)
     stops <- get_stopwords(lang, testing)
-    corpus <- create_corpus(metadata, text, lang)
+    corpus <- create_corpus(metadata, text, c(lang))

     vlog$debug("get features")
     tdm_matrix <- create_tdm_matrix(corpus$stemmed)
     distance_matrix <- get_distance_matrix(tdm_matrix)
-    lang_detected <- get_OHE_feature(metadata, "lang_detected")
-    vlog$info(paste("Languages:",
-                    paste(paste0(names(lang_detected),
-                                 ":",
-                                 apply(lang_detected, 2, sum)),
-                          collapse = " "),
-                    sep=" "))
     features <- concatenate_features(distance_matrix)
     vlog$debug("get clusters")
     clusters <- create_clusters(as.dist(features), max_clusters=max_clusters)
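Two performance-relevant changes land here: the doParallel worker pool doubles from 3 to 6 (feeding the parallelized loops elsewhere in the PR), and per-document language detection, its one-hot feature, and the language logging are dropped in favour of the single caller-supplied language. For reference, a minimal doParallel setup and sanity check; the core count is illustrative and should match the host machine:

    library(doParallel)

    registerDoParallel(6)   # register 6 workers; pick a count that fits the host
    getDoParWorkers()       # foreach reports how many workers it will actually use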
10 changes: 6 additions & 4 deletions server/workers/services/src/apis/persistence.py
@@ -57,6 +57,7 @@ def write_revision(database, vis_id, data, rev_id=None):
     session.add(new_rev)
     vis.vis_latest = rev_id
     session.commit()
+    session.close()


 def create_visualization(database,
@@ -74,13 +75,14 @@
     session.add(new_vis)
     session.commit()
     write_revision(database, vis_id, data, 1)
+    session.close()


 def exists_visualization(database, vis_id):
     session = select_session(sessions.get(database))
     vis = session.query(Visualizations).filter_by(vis_id=vis_id).first()
-    session.commit()
     exists = True if vis else False
+    session.close()
     return exists


@@ -89,8 +91,8 @@ def get_last_version(database, vis_id, details=False, context=False):


 def get_revision(database, vis_id, rev_id, details=False, context=False):
-    session = select_session(sessions.get(database))
     try:
+        session = select_session(sessions.get(database))
         if rev_id is None:
             vis, rev = (session
                         .query(Visualizations, Revisions)
@@ -107,7 +109,7 @@
                         .filter(Revisions.rev_vis == vis_id)
                         .filter(Revisions.rev_id == rev_id)
                         ).first()
-        session.commit()
+        session.close()
         if context is True:
             res = {
                 "rev_vis": rev.rev_vis,
@@ -137,7 +139,6 @@ def get_context(database, vis_id, revision_context=False):
                 .filter(Revisions.rev_vis == vis_id)
                 .filter(Revisions.rev_id == Visualizations.vis_latest)
                 ).first()
-    session.commit()
     res = {
         "rev_vis": rev.rev_vis,
         "vis_query": rev.vis_query,
@@ -148,6 +149,7 @@
     if revision_context == 'true':
         data = json.loads(rev.rev_data)
         res["additional_context"] = data.get("additional_context", {})
+    session.close()
     return res


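The persistence changes close every SQLAlchemy session after use and stop committing read-only queries, which prevents pooled connections from leaking under load. One remaining gap: if an exception is raised before session.close(), the session still leaks. A hedged sketch of a context-manager variant that would guarantee the close; session_scope is a hypothetical helper, not part of this PR, and it reuses select_session/sessions from the module above:

    from contextlib import contextmanager

    @contextmanager
    def session_scope(database):
        # Hypothetical helper: always close the session, even on error.
        session = select_session(sessions.get(database))
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    # Usage sketch:
    # with session_scope(database) as session:
    #     vis = session.query(Visualizations).filter_by(vis_id=vis_id).first()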
4 changes: 3 additions & 1 deletion server/workers/tests/Backend regression test cases.csv
@@ -34,4 +34,6 @@ case id,data integration,affected component,search query,from,to,article types,s
 17,base,clustering,stuff,1665-01-01,2020-07-01,['121'],most-relevant,max n cluster,17 <= 15,,,,
 18,base,clustering,stuff,2018-06-28,2020-08-06,['121'],most-relevant,max n cluster,17 <= 15,,,,
 19,base,summarization,stuff,2018-10-28,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"just' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,,
-20,pubmed,retrieval,"""Our objective was to explore the effect of depression and anxiety on adherence to antiretroviral therapy (ART) among MSM with newly diagnosed HIV infections.""",1809-01-01,2020-09-25,all,most-relevant,successful map,,,,,
+20,pubmed,retrieval,"""Our objective was to explore the effect of depression and anxiety on adherence to antiretroviral therapy (ART) among MSM with newly diagnosed HIV infections.""",1809-01-01,2020-09-25,all,most-relevant,successful map,,,,,
+21,pubmed,preprocessing,canine covid19,1809-01-01,2020-12-07,all,most-relevant,successful map,"Error in if (nchar(x) > 17000) {: Missing value, where TRUE/FALSE is needed",,,,
+22,base,summarization,explosion,2018-10-28,2020-12-08,['121'],most-relevant,no ??? in bubble titles,,,,,
The remaining changes regenerate stored regression fixtures; GitHub does not render these large diffs by default:

2 changes: 1 addition & 1 deletion server/workers/tests/knowncases/testcase0.json
4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase11.json
4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase12.json
4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase13.json
2 changes: 1 addition & 1 deletion server/workers/tests/knowncases/testcase14.json
4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase15.json
4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase16.json
4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase17.json
4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase18.json
4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase19.json