Skip to content

Commit

Permalink
Merge pull request OpenKnowledgeMaps#515 from OpenKnowledgeMaps/backend-maintenance
Browse files Browse the repository at this point in the history

backend maintenance Jan 21

Former-commit-id: d8b6b6c
  • Loading branch information
chreman committed Jan 26, 2021
2 parents 743ad00 + 94cda74 commit 361f8f7
Show file tree
Hide file tree
Showing 33 changed files with 126 additions and 82 deletions.
2 changes: 1 addition & 1 deletion server/preprocessing/other-scripts/base.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ blog <- getLogger('api.base')

get_papers <- function(query, params, limit=100,
filter=NULL,
retry_opts=rbace::bs_retry_options()) {
retry_opts=rbace::bs_retry_options(3,60,3,4)) {

blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Search:", query))
start.time <- Sys.time()
Expand Down
27 changes: 11 additions & 16 deletions server/preprocessing/other-scripts/features.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
vflog <- getLogger('vis.features')

create_corpus <- function(metadata, text, lang=NULL) {
create_corpus <- function(metadata, text, languages=c("en")) {
valid <- getStemLanguages()
# if lang not given use lang detection
if (is.null(lang)) {
text["language"] <- unlist(lapply(metadata$lang_detected,
function(x) { if (x %in% valid) x else "english"
}
))
} else {
text["language"] <- if (lang %in% valid) lang else NA
}
mapping <- list(content = "content", id = "id", language = "language")
text["languages"] <- languages
mapping <- list(content = "content", id = "id", languages = "languages")
myReader <- readTabular(mapping = mapping)

corpus <- Corpus(DataframeSource(text),
Expand Down Expand Up @@ -54,14 +46,17 @@ concatenate_features <- function(...) {
return(cbind(...))
}

# Remove stopwords (for one or more languages) from text.
# S3 generic: dispatches on the class of `x` (character or tm PlainTextDocument).
remove_stop_words <- function(x, languages) UseMethod("remove_stop_words", x)

# Character method: splits `x` on single spaces, drops every token that
# appears in the combined stopword lists of all requested languages, and
# re-joins the surviving tokens with single spaces.
# NOTE(review): `get_stopwords()` and `TESTING` are defined elsewhere in
# this project; assumed to return a vector/list of stopword strings.
remove_stop_words.character <- function(x, languages) {
  y <- unlist(strsplit(x, " "))
  # Accumulate the stopword lists of every requested language.
  stops <- list()
  for (lang in languages) {
    stops <- c(stops, get_stopwords(lang, TESTING))
  }
  stopword <- unlist(lapply(y, function(z) z %in% stops))
  doc <- y[which(!stopword)]
  # Last expression is the return value (the cleaned, re-joined string).
  paste(doc, collapse = " ")
}

# PlainTextDocument method: languages default to the document's own
# "languages" metadata field; delegates to the character method via
# tm::content_transformer so document metadata is preserved.
remove_stop_words.PlainTextDocument <- function(x, languages = meta(x, "languages")) {
  content_transformer(remove_stop_words.character)(x, languages)
}
4 changes: 2 additions & 2 deletions server/preprocessing/other-scripts/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ replace_keywords_if_empty <- function(metadata, stops, service) {
metadata$subject[is.na(metadata$subject)] <- ""
} else {
candidates = mapply(paste, metadata$title)
candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
Expand All @@ -106,7 +106,7 @@ replace_keywords_if_empty <- function(metadata, stops, service) {
nn_tfidf = TermDocumentMatrix(nn_corpus, control = list(tokenize = SplitTokenizer, weighting = function(x) weightSMART(x, spec="ntn")))
tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
tfidf_top_names = lapply(tfidf_top, names)
replacement_keywords <- lapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
replacement_keywords = gsub("_", " ", replacement_keywords)

Expand Down
2 changes: 1 addition & 1 deletion server/preprocessing/other-scripts/pubmed.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ get_papers <- function(query, params = NULL, limit = 100, retry_opts = rentrez::
stop(paste("No results retrieved."))
}

out <- lapply(xml, function(z) {
out <- mclapply(xml, function(z) {
flds <- switch(
xml2::xml_name(z),
PubmedArticle = fields,
Expand Down
3 changes: 1 addition & 2 deletions server/preprocessing/other-scripts/summarize.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ create_cluster_labels <- function(clusters, metadata,


# Fix the keyword casing of every cluster label, in parallel.
# clusterlabels: list/character vector of raw cluster labels.
# type_counts: casing-frequency statistics passed through to
#   fix_keyword_casing() (defined elsewhere in this project).
# Returns a character vector of labels with corrected casing.
# Uses parallel::mclapply (registered workers set up in vis_layout.R).
fix_cluster_labels <- function(clusterlabels, type_counts){
  unlist(mclapply(clusterlabels, function(x) {
    fix_keyword_casing(x, type_counts)
  }))
}
Expand Down Expand Up @@ -113,7 +113,6 @@ get_cluster_corpus <- function(clusters, metadata, service, stops, taxonomy_sepa
matches = which(metadata$id %in% group)
titles = metadata$title[matches]
subjects = metadata$subject[matches]
langs = metadata$lang_detected[matches]
titles = lapply(titles, function(x) {gsub("[^[:alnum:]-]", " ", x)})
titles = lapply(titles, gsub, pattern="\\s+", replacement=" ")
title_ngrams <- get_title_ngrams(titles, stops)
Expand Down
8 changes: 6 additions & 2 deletions server/preprocessing/other-scripts/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,19 @@ detect_error <- function(failed, service) {
if (!is.null(failed$query_reason)) {
# map response to individual error codes/messages
# then return them as json list
if (service == 'base' && startsWith(failed$query_reason, "Error in curl::curl_fetch_memory(x$url$url, handle = x$url$handle): Timeout was reached")){
reason <- c(reason, 'API error: timeout')
}
if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure: 502, bad gateway")){
reason <- c(reason, 'API error: requested metadata size')
}
if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure: 500")){
reason <- c(reason, 'API error: PubMed not reachable')
}
if (service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure")){
if (length(reason) == 0 && service == 'pubmed' && startsWith(failed$query_reason, "HTTP failure")){
reason <- c(reason, 'unexpected PubMed API error')
} else {
}
if (length(reason) == 0) {
result <- regmatches(failed$query, regexec(phrasepattern, failed$query))
# if not one of the known data source API errors:
# apply query error detection heuristics
Expand Down
12 changes: 2 additions & 10 deletions server/preprocessing/other-scripts/vis_layout.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ library(stringi)
library(stringdist)
library(plyr)
library(onehot)
registerDoParallel(3)
registerDoParallel(6)


vlog <- getLogger('vis')
Expand Down Expand Up @@ -54,20 +54,12 @@ vis_layout <- function(text, metadata, service,
text <- filtered$text

if(vis_type=='overview'){
metadata["lang_detected"] <- detect_language(text$content)
stops <- get_stopwords(lang, testing)
corpus <- create_corpus(metadata, text, lang)
corpus <- create_corpus(metadata, text, c(lang))

vlog$debug("get features")
tdm_matrix <- create_tdm_matrix(corpus$stemmed)
distance_matrix <- get_distance_matrix(tdm_matrix)
lang_detected <- get_OHE_feature(metadata, "lang_detected")
vlog$info(paste("Languages:",
paste(paste0(names(lang_detected),
":",
apply(lang_detected, 2, sum)),
collapse = " "),
sep=" "))
features <- concatenate_features(distance_matrix)
vlog$debug("get clusters")
clusters <- create_clusters(as.dist(features), max_clusters=max_clusters)
Expand Down
10 changes: 6 additions & 4 deletions server/workers/services/src/apis/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def write_revision(database, vis_id, data, rev_id=None):
session.add(new_rev)
vis.vis_latest = rev_id
session.commit()
session.close()


def create_visualization(database,
Expand All @@ -74,13 +75,14 @@ def create_visualization(database,
session.add(new_vis)
session.commit()
write_revision(database, vis_id, data, 1)
session.close()


def exists_visualization(database, vis_id):
    # Check whether a visualization with the given id exists in `database`.
    # Returns True when a matching row is found, False otherwise.
    session = select_session(sessions.get(database))
    # .first() yields the row or None when no match exists.
    vis = session.query(Visualizations).filter_by(vis_id=vis_id).first()
    session.commit()
    exists = True if vis else False
    # Close the session so its connection is returned to the pool.
    session.close()
    return exists


Expand All @@ -89,8 +91,8 @@ def get_last_version(database, vis_id, details=False, context=False):


def get_revision(database, vis_id, rev_id, details=False, context=False):
session = select_session(sessions.get(database))
try:
session = select_session(sessions.get(database))
if rev_id is None:
vis, rev = (session
.query(Visualizations, Revisions)
Expand All @@ -107,7 +109,7 @@ def get_revision(database, vis_id, rev_id, details=False, context=False):
.filter(Revisions.rev_vis == vis_id)
.filter(Revisions.rev_id == rev_id)
).first()
session.commit()
session.close()
if context is True:
res = {
"rev_vis": rev.rev_vis,
Expand Down Expand Up @@ -137,7 +139,6 @@ def get_context(database, vis_id, revision_context=False):
.filter(Revisions.rev_vis == vis_id)
.filter(Revisions.rev_id == Visualizations.vis_latest)
).first()
session.commit()
res = {
"rev_vis": rev.rev_vis,
"vis_query": rev.vis_query,
Expand All @@ -148,6 +149,7 @@ def get_context(database, vis_id, revision_context=False):
if revision_context == 'true':
data = json.loads(rev.rev_data)
res["additional_context"] = data.get("additional_context", {})
session.close()
return res


Expand Down
4 changes: 3 additions & 1 deletion server/workers/tests/Backend regression test cases.csv
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,6 @@ case id,data integration,affected component,search query,from,to,article types,s
17,base,clustering,stuff,1665-01-01,2020-07-01,['121'],most-relevant,max n cluster,17 <= 15,,,,
18,base,clustering,stuff,2018-06-28,2020-08-06,['121'],most-relevant,max n cluster,17 <= 15,,,,
19,base,summarization,stuff,2018-10-28,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"just' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,,
20,pubmed,retrieval,"""Our objective was to explore the effect of depression and anxiety on adherence to antiretroviral therapy (ART) among MSM with newly diagnosed HIV infections.""",1809-01-01,2020-09-25,all,most-relevant,successful map,,,,,
20,pubmed,retrieval,"""Our objective was to explore the effect of depression and anxiety on adherence to antiretroviral therapy (ART) among MSM with newly diagnosed HIV infections.""",1809-01-01,2020-09-25,all,most-relevant,successful map,,,,,
21,pubmed,preprocessing,canine covid19,1809-01-01,2020-12-07,all,most-relevant,successful map,"Error in if (nchar(x) > 17000) {: Missing value, where TRUE/FALSE is needed",,,,
22,base,summarization,explosion,2018-10-28,2020-12-08,['121'],most-relevant,no ??? in bubble titles,,,,,
2 changes: 1 addition & 1 deletion server/workers/tests/knowncases/testcase0.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase11.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase12.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase13.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion server/workers/tests/knowncases/testcase14.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase15.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase16.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase17.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase18.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions server/workers/tests/knowncases/testcase19.json

Large diffs are not rendered by default.

Loading

0 comments on commit 361f8f7

Please sign in to comment.