Skip to content

Commit

Permalink
Merge pull request OpenKnowledgeMaps#515 from OpenKnowledgeMaps/backend-maintenance
Browse files Browse the repository at this point in the history

backend maintenance Jan 21

Former-commit-id: d8b6b6c
  • Loading branch information
chreman committed Jan 26, 2021
2 parents 743ad00 + 94cda74 commit 361f8f7
Show file tree
Hide file tree
Showing 33 changed files with 126 additions and 82 deletions.
2 changes: 1 addition & 1 deletion server/preprocessing/other-scripts/base.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ blog <- getLogger('api.base')

get_papers <- function(query, params, limit=100,
filter=NULL,
retry_opts=rbace::bs_retry_options()) {
retry_opts=rbace::bs_retry_options(3,60,3,4)) {

blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Search:", query))
start.time <- Sys.time()
Expand Down
27 changes: 11 additions & 16 deletions server/preprocessing/other-scripts/features.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
vflog <- getLogger('vis.features')

create_corpus <- function(metadata, text, lang=NULL) {
create_corpus <- function(metadata, text, languages=c("en")) {
valid <- getStemLanguages()
# if lang not given use lang detection
if (is.null(lang)) {
text["language"] <- unlist(lapply(metadata$lang_detected,
function(x) { if (x %in% valid) x else "english"
}
))
} else {
text["language"] <- if (lang %in% valid) lang else NA
}
mapping <- list(content = "content", id = "id", language = "language")
text["languages"] <- languages
mapping <- list(content = "content", id = "id", languages = "languages")
myReader <- readTabular(mapping = mapping)

corpus <- Corpus(DataframeSource(text),
Expand Down Expand Up @@ -54,14 +46,17 @@ concatenate_features <- function(...) {
return(cbind(...))
}

# Remove stopwords (for one or more languages) from text.
# S3 generic: dispatches on the class of `x` (character or tm PlainTextDocument).
remove_stop_words <- function(x, languages) UseMethod("remove_stop_words", x)

# Character method: splits `x` on single spaces, drops every token that
# appears in the combined stopword lists of all requested languages, and
# re-joins the surviving tokens with single spaces.
# NOTE(review): `get_stopwords()` and `TESTING` are defined elsewhere in
# this project; assumed to return a vector/list of stopword strings.
remove_stop_words.character <- function(x, languages) {
  y <- unlist(strsplit(x, " "))
  # Accumulate the stopword lists of every requested language.
  stops <- list()
  for (lang in languages) {
    stops <- c(stops, get_stopwords(lang, TESTING))
  }
  stopword <- unlist(lapply(y, function(z) z %in% stops))
  doc <- y[which(!stopword)]
  # Last expression is the return value (the cleaned, re-joined string).
  paste(doc, collapse = " ")
}

# PlainTextDocument method: languages default to the document's own
# "languages" metadata field; delegates to the character method via
# tm::content_transformer so document metadata is preserved.
remove_stop_words.PlainTextDocument <- function(x, languages = meta(x, "languages")) {
  content_transformer(remove_stop_words.character)(x, languages)
}
4 changes: 2 additions & 2 deletions server/preprocessing/other-scripts/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ replace_keywords_if_empty <- function(metadata, stops, service) {
metadata$subject[is.na(metadata$subject)] <- ""
} else {
candidates = mapply(paste, metadata$title)
candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
Expand All @@ -106,7 +106,7 @@ replace_keywords_if_empty <- function(metadata, stops, service) {
nn_tfidf = TermDocumentMatrix(nn_corpus, control = list(tokenize = SplitTokenizer, weighting = function(x) weightSMART(x, spec="ntn")))
tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
tfidf_top_names = lapply(tfidf_top, names)
replacement_keywords <- lapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
replacement_keywords = gsub("_", " ", replacement_keywords)

Expand Down
2 changes: 1 addition & 1 deletion server/preprocessing/other-scripts/pubmed.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ get_papers <- function(query, params = NULL, limit = 100, retry_opts = rentrez::
stop(paste("No results retrieved."))
}

out <- lapply(xml, function(z) {
out <- mclapply(xml, function(z) {
flds <- switch(
xml2::xml_name(z),
PubmedArticle = fields,
Expand Down
3 changes: 1 addition & 2 deletions server/preprocessing/other-scripts/summarize.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ create_cluster_labels <- function(clusters, metadata,


# Fix the keyword casing of every cluster label, in parallel.
# clusterlabels: list/character vector of raw cluster labels.
# type_counts: casing-frequency statistics passed through to
#   fix_keyword_casing() (defined elsewhere in this project).
# Returns a character vector of labels with corrected casing.
# Uses parallel::mclapply (registered workers set up in vis_layout.R).
fix_cluster_labels <- function(clusterlabels, type_counts){
  unlist(mclapply(clusterlabels, function(x) {
    fix_keyword_casing(x, type_counts)
  }))
}
Expand Down Expand Up @@ -113,7 +113,6 @@ get_cluster_corpus <- function(clusters, metadata, service, stops, taxonomy_sepa
matches = which(metadata$id %in% group)
titles = metadata$title[matches]
subjects = metadata$subject[matches]
langs = metadata$lang_detected[matches]
titles = lapply(titles, function(x) {gsub("[^[:alnum:]-]", " ", x)})
titles = lapply(titles, gsub, pattern="\\s+", replacement=" ")
title_ngrams <- get_title_ngrams(titles, stops)
Expand Down
8 changes: 6 additions & 2 deletions server/preprocessing/other-scripts/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,19 @@ detect_error <- function(failed, service) {
if (!is.null(failed$query_reason)) {
# map response to individual error codes/messages
# then return them as json list
if (service == 'base' && startsWith(failed$query_reason, "Error in curl::curl_fetch_memory(x$url$url, handle = x$url$handle): Timeout was reached")){
reason <- c(reason, 'API error: timeout')
}
if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure: 502, bad gateway")){
reason <- c(reason, 'API error: requested metadata size')
}
if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure: 500")){
reason <- c(reason, 'API error: PubMed not reachable')
}
if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure")){
if (length(reason) == 0 && service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure")){
reason <- c(reason, 'unexpected PubMed API error')
} else {
}
if (length(reason) == 0) {
result <- regmatches(failed$query, regexec(phrasepattern, failed$query))
# if not one of the known data source API errors:
# apply query error detection heuristics
Expand Down
12 changes: 2 additions & 10 deletions server/preprocessing/other-scripts/vis_layout.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ library(stringi)
library(stringdist)
library(plyr)
library(onehot)
registerDoParallel(3)
registerDoParallel(6)


vlog <- getLogger('vis')
Expand Down Expand Up @@ -54,20 +54,12 @@ vis_layout <- function(text, metadata, service,
text <- filtered$text

if(vis_type=='overview'){
metadata["lang_detected"] <- detect_language(text$content)
stops <- get_stopwords(lang, testing)
corpus <- create_corpus(metadata, text, lang)
corpus <- create_corpus(metadata, text, c(lang))

vlog$debug("get features")
tdm_matrix <- create_tdm_matrix(corpus$stemmed)
distance_matrix <- get_distance_matrix(tdm_matrix)
lang_detected <- get_OHE_feature(metadata, "lang_detected")
vlog$info(paste("Languages:",
paste(paste0(names(lang_detected),
":",
apply(lang_detected, 2, sum)),
collapse = " "),
sep=" "))
features <- concatenate_features(distance_matrix)
vlog$debug("get clusters")
clusters <- create_clusters(as.dist(features), max_clusters=max_clusters)
Expand Down
10 changes: 6 additions & 4 deletions server/workers/services/src/apis/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def write_revision(database, vis_id, data, rev_id=None):
session.add(new_rev)
vis.vis_latest = rev_id
session.commit()
session.close()


def create_visualization(database,
Expand All @@ -74,13 +75,14 @@ def create_visualization(database,
session.add(new_vis)
session.commit()
write_revision(database, vis_id, data, 1)
session.close()


def exists_visualization(database, vis_id):
    # Check whether a visualization with the given id exists in `database`.
    # Returns True when a matching row is found, False otherwise.
    session = select_session(sessions.get(database))
    # .first() yields the row or None when no match exists.
    vis = session.query(Visualizations).filter_by(vis_id=vis_id).first()
    session.commit()
    exists = True if vis else False
    # Close the session so its connection is returned to the pool.
    session.close()
    return exists


Expand All @@ -89,8 +91,8 @@ def get_last_version(database, vis_id, details=False, context=False):


def get_revision(database, vis_id, rev_id, details=False, context=False):
session = select_session(sessions.get(database))
try:
session = select_session(sessions.get(database))
if rev_id is None:
vis, rev = (session
.query(Visualizations, Revisions)
Expand All @@ -107,7 +109,7 @@ def get_revision(database, vis_id, rev_id, details=False, context=False):
.filter(Revisions.rev_vis == vis_id)
.filter(Revisions.rev_id == rev_id)
).first()
session.commit()
session.close()
if context is True:
res = {
"rev_vis": rev.rev_vis,
Expand Down Expand Up @@ -137,7 +139,6 @@ def get_context(database, vis_id, revision_context=False):
.filter(Revisions.rev_vis == vis_id)
.filter(Revisions.rev_id == Visualizations.vis_latest)
).first()
session.commit()
res = {
"rev_vis": rev.rev_vis,
"vis_query": rev.vis_query,
Expand All @@ -148,6 +149,7 @@ def get_context(database, vis_id, revision_context=False):
if revision_context == 'true':
data = json.loads(rev.rev_data)
res["additional_context"] = data.get("additional_context", {})
session.close()
return res


Expand Down
4 changes: 3 additions & 1 deletion server/workers/tests/Backend regression test cases.csv
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,6 @@ case id,data integration,affected component,search query,from,to,article types,s
17,base,clustering,stuff,1665-01-01,2020-07-01,['121'],most-relevant,max n cluster,17 <= 15,,,,
18,base,clustering,stuff,2018-06-28,2020-08-06,['121'],most-relevant,max n cluster,17 <= 15,,,,
19,base,summarization,stuff,2018-10-28,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"just' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,,
20,pubmed,retrieval,"""Our objective was to explore the effect of depression and anxiety on adherence to antiretroviral therapy (ART) among MSM with newly diagnosed HIV infections.""",1809-01-01,2020-09-25,all,most-relevant,successful map,,,,,
20,pubmed,retrieval,"""Our objective was to explore the effect of depression and anxiety on adherence to antiretroviral therapy (ART) among MSM with newly diagnosed HIV infections.""",1809-01-01,2020-09-25,all,most-relevant,successful map,,,,,
21,pubmed,preprocessing,canine covid19,1809-01-01,2020-12-07,all,most-relevant,successful map,"Error in if (nchar(x) > 17000) {: Missing value, where TRUE/FALSE is needed",,,,
22,base,summarization,explosion,2018-10-28,2020-12-08,['121'],most-relevant,no ??? in bubble titles,,,,,
2 changes: 1 addition & 1 deletion server/workers/tests/knowncases/testcase0.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase11.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase12.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase13.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion server/workers/tests/knowncases/testcase14.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase15.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase16.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase17.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase18.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase19.json

Large diffs are not rendered by default.

Loading

0 comments on commit 361f8f7

Please sign in to comment.