From 6625ca9921e5fcb7d31c6829b0b8a053d900c191 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:39:30 +0200 Subject: [PATCH 01/14] add an abstraction to the diff tool --- .../diff/bubastis_diff.rb | 41 ++++++++++--------- lib/ontologies_linked_data/diff/diff.rb | 14 ++++++- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/lib/ontologies_linked_data/diff/bubastis_diff.rb b/lib/ontologies_linked_data/diff/bubastis_diff.rb index d5de6f65..5e7eab62 100644 --- a/lib/ontologies_linked_data/diff/bubastis_diff.rb +++ b/lib/ontologies_linked_data/diff/bubastis_diff.rb @@ -10,7 +10,7 @@ class InputFileNotFoundError < Diff::DiffException class DiffFileNotGeneratedException < Diff::DiffException end - class BubastisDiffCommand + class BubastisDiffCommand < DiffTool # Bubastis version 1.2 # 18th December 2014 @@ -37,14 +37,32 @@ class BubastisDiffCommand # Loading one file locally and one from the web and outputting results to plain text: # java -jar bubastis_1_2.jar -ontology1 "H://disease_ontology_version_1.owl" -ontology2 "http://www.disease.org/diseaseontology_latest.owl" -output "C://my_diff.txt" - def initialize(input_fileOld, input_fileNew) + def initialize(old_file_path, new_file_path) @bubastis_jar_path = LinkedData.bindir + "/bubastis.jar" - @input_fileOld = input_fileOld - @input_fileNew = input_fileNew + @input_fileOld = old_file_path + @input_fileNew = new_file_path @output_repo = File.expand_path(@input_fileNew).gsub(File.basename(@input_fileNew),'') @file_diff_path = nil end + + def file_diff_path + @file_diff_path + end + + def diff + setup_environment + call_bubastis_java_cmd + if @file_diff_path.nil? + raise DiffFileNotGeneratedException, "Diff file nil" + elsif not File.exist?(@file_diff_path) + raise DiffFileNotGeneratedException, "Diff file not found in #{@file_diff_path}" + end + return @file_diff_path + end + + private + def setup_environment if @input_fileOld.nil? or (not File.exist?(@input_fileOld)) raise InputFileNotFoundError, "#{@input_fileOld} not found." @@ -104,21 +122,6 @@ def call_bubastis_java_cmd end return @file_diff_path end - - def file_diff_path - @file_diff_path - end - - def diff - setup_environment - call_bubastis_java_cmd - if @file_diff_path.nil? - raise DiffFileNotGeneratedException, "Diff file nil" - elsif not File.exist?(@file_diff_path) - raise DiffFileNotGeneratedException, "Diff file not found in #{@file_diff_path}" - end - return @file_diff_path - end end end end diff --git a/lib/ontologies_linked_data/diff/diff.rb b/lib/ontologies_linked_data/diff/diff.rb index 32b054f8..3c89326e 100644 --- a/lib/ontologies_linked_data/diff/diff.rb +++ b/lib/ontologies_linked_data/diff/diff.rb @@ -1,8 +1,20 @@ module LinkedData module Diff - class < Date: Mon, 4 Apr 2022 12:42:24 +0200 Subject: [PATCH 02/14] add submission process operation abstraction --- lib/ontologies_linked_data.rb | 1 + .../submission_process/submission_process.rb | 14 ++++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 lib/ontologies_linked_data/services/submission_process/submission_process.rb diff --git a/lib/ontologies_linked_data.rb b/lib/ontologies_linked_data.rb index 678e0e89..54cb382e 100644 --- a/lib/ontologies_linked_data.rb +++ b/lib/ontologies_linked_data.rb @@ -34,6 +34,7 @@ # Require all models project_root = File.dirname(File.absolute_path(__FILE__)) +require 'ontologies_linked_data/services/submission_process/submission_process' # We need to require deterministic - that is why we have the sort. 
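# A plausible sketch of the DiffTool base class that BubastisDiffCommand now extends
# (its exact definition in diff.rb is assumed here, not quoted verbatim from the patch):
module LinkedData
  module Diff
    class DiffTool
      def initialize(old_file_path, new_file_path)
        @old_file_path = old_file_path
        @new_file_path = new_file_path
      end

      # Concrete tools return the path of the generated diff file.
      def diff
        raise NotImplementedError
      end
    end
  end
end
# With the refactor above, a caller only needs:
#   LinkedData::Diff::BubastisDiffCommand.new(old_path, new_path).diff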
models = Dir.glob(project_root + '/ontologies_linked_data/models/**/*.rb').sort diff --git a/lib/ontologies_linked_data/services/submission_process/submission_process.rb b/lib/ontologies_linked_data/services/submission_process/submission_process.rb new file mode 100644 index 00000000..4e7433e8 --- /dev/null +++ b/lib/ontologies_linked_data/services/submission_process/submission_process.rb @@ -0,0 +1,14 @@ +module LinkedData + module Services + class OntologySubmissionProcess + + def initialize(submission) + @submission = submission + end + + def process(logger, options = {}) + raise NotImplementedError + end + end + end +end From f67e6dbd8020f362398eec3c9614981f281d8965 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:45:33 +0200 Subject: [PATCH 03/14] auto lint refactor of ontologies_linked_data.rb --- lib/ontologies_linked_data.rb | 64 +++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/lib/ontologies_linked_data.rb b/lib/ontologies_linked_data.rb index 54cb382e..996831cd 100644 --- a/lib/ontologies_linked_data.rb +++ b/lib/ontologies_linked_data.rb @@ -1,39 +1,43 @@ -require "goo" +require 'goo' # Make sure we're in the load path -lib_dir = File.dirname(__FILE__)+"/../lib" +lib_dir = "#{File.dirname(__FILE__)}/../lib" $LOAD_PATH.unshift lib_dir unless $LOAD_PATH.include?(lib_dir) # Setup Goo (repo connection and namespaces) -require "ontologies_linked_data/config/config" +require 'ontologies_linked_data/config/config' # Include other dependent code -require "ontologies_linked_data/security/authorization" -require "ontologies_linked_data/security/access_control" -require "ontologies_linked_data/security/access_denied_middleware" -require "ontologies_linked_data/hypermedia/hypermedia" -require "ontologies_linked_data/serializer" -require "ontologies_linked_data/serializers/serializers" -require "ontologies_linked_data/utils/file" -require "ontologies_linked_data/utils/triples" -require "ontologies_linked_data/utils/notifications" -require "ontologies_linked_data/utils/ontology_csv_writer" -require "ontologies_linked_data/utils/multi_logger" -require "ontologies_linked_data/parser/parser" -require "ontologies_linked_data/diff/diff" -require "ontologies_linked_data/monkeypatches/class" # load before object -require "ontologies_linked_data/monkeypatches/object" -require "ontologies_linked_data/monkeypatches/logging" -require "ontologies_linked_data/sample_data/sample_data" -require "ontologies_linked_data/mappings/mappings" -require "ontologies_linked_data/http_cache/cacheable_resource" -require "ontologies_linked_data/metrics/metrics" +require 'ontologies_linked_data/security/authorization' +require 'ontologies_linked_data/security/access_control' +require 'ontologies_linked_data/security/access_denied_middleware' +require 'ontologies_linked_data/hypermedia/hypermedia' +require 'ontologies_linked_data/serializer' +require 'ontologies_linked_data/serializers/serializers' +require 'ontologies_linked_data/utils/file' +require 'ontologies_linked_data/utils/triples' +require 'ontologies_linked_data/utils/notifications' +require 'ontologies_linked_data/utils/ontology_csv_writer' +require 'ontologies_linked_data/utils/multi_logger' +require 'ontologies_linked_data/parser/parser' +require 'ontologies_linked_data/diff/diff' +require 'ontologies_linked_data/monkeypatches/class' # load before object +require 'ontologies_linked_data/monkeypatches/object' +require 'ontologies_linked_data/monkeypatches/logging' +require 
'ontologies_linked_data/sample_data/sample_data' +require 'ontologies_linked_data/mappings/mappings' +require 'ontologies_linked_data/http_cache/cacheable_resource' +require 'ontologies_linked_data/metrics/metrics' # Require base model -require "ontologies_linked_data/models/base" +require 'ontologies_linked_data/models/base' -# Require all models + + + +# Require all models and services project_root = File.dirname(File.absolute_path(__FILE__)) +# Require base services require 'ontologies_linked_data/services/submission_process/submission_process' # We need to require deterministic - that is why we have the sort. @@ -42,12 +46,20 @@ require m end +# We need to require deterministic - that is why we have the sort. +models = Dir.glob("#{project_root}/ontologies_linked_data/models/**/*.rb").sort +models.each do |m| + require m +end + + + module LinkedData def rootdir File.dirname(File.absolute_path(__FILE__)) end def bindir - File.expand_path(rootdir + '/../bin') + File.expand_path("#{rootdir}/../bin") end end From aa0d4c8d032a32b8d258c7973fb173db87b7559b Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:45:57 +0200 Subject: [PATCH 04/14] add the requires of the services modules --- lib/ontologies_linked_data.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/ontologies_linked_data.rb b/lib/ontologies_linked_data.rb index 996831cd..92d51fcd 100644 --- a/lib/ontologies_linked_data.rb +++ b/lib/ontologies_linked_data.rb @@ -41,7 +41,8 @@ require 'ontologies_linked_data/services/submission_process/submission_process' # We need to require deterministic - that is why we have the sort. -models = Dir.glob(project_root + '/ontologies_linked_data/models/**/*.rb').sort + +models = Dir.glob("#{project_root}/ontologies_linked_data/services/**/*.rb").sort models.each do |m| require m end From 8883e13f11b132c9ac7e1240f211679402c069e5 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:50:14 +0200 Subject: [PATCH 05/14] add the SubmissionArchiver operation --- .../operations/submission_archiver.rb | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 lib/ontologies_linked_data/services/submission_process/operations/submission_archiver.rb diff --git a/lib/ontologies_linked_data/services/submission_process/operations/submission_archiver.rb b/lib/ontologies_linked_data/services/submission_process/operations/submission_archiver.rb new file mode 100644 index 00000000..1e69df0f --- /dev/null +++ b/lib/ontologies_linked_data/services/submission_process/operations/submission_archiver.rb @@ -0,0 +1,42 @@ +module LinkedData + module Services + class OntologySubmissionArchiver < OntologySubmissionProcess + + FILES_TO_DELETE = %w[labels.ttl mappings.ttl obsolete.ttl owlapi.xrdf errors.log] + + + def process + submission_archive + end + + private + def submission_archive + @submission.submissionStatus = nil + status = LinkedData::Models::SubmissionStatus.find("ARCHIVED").first + @submission.add_submission_status(status) + + + # Delete everything except for original ontology file. + @submission.ontology.bring(:submissions) + submissions = @submission.ontology.submissions + unless submissions.nil? + submissions.each { |s| s.bring(:submissionId) } + submission = submissions.sort { |a, b| b.submissionId <=> a.submissionId }.first + # Don't perform deletion if this is the most recent submission. 
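# delete_old_submission_files (below) removes only the generated artifacts (the files
# listed in FILES_TO_DELETE plus the CSV and, when present, the parsing log) and leaves
# the originally uploaded ontology file untouched.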
+ delete_old_submission_files if @submission.submissionId < submission.submissionId + end + end + + def delete_old_submission_files + path_to_repo = @submission.data_folder + submission_files = FILES_TO_DELETE.map { |f| File.join(path_to_repo, f) } + submission_files.push(@submission.csv_path) + submission_files.push(@submission.parsing_log_path) unless @submission.parsing_log_path.nil? + FileUtils.rm(submission_files, force: true) + end + + end + + + end +end From d31637d253653d3bd12333179bc505643c56a429 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:50:52 +0200 Subject: [PATCH 06/14] adding the RDF generator operation --- .../operations/submission_rdf_generator.rb | 452 ++++++++++++++++++ 1 file changed, 452 insertions(+) create mode 100644 lib/ontologies_linked_data/services/submission_process/operations/submission_rdf_generator.rb diff --git a/lib/ontologies_linked_data/services/submission_process/operations/submission_rdf_generator.rb b/lib/ontologies_linked_data/services/submission_process/operations/submission_rdf_generator.rb new file mode 100644 index 00000000..702001ad --- /dev/null +++ b/lib/ontologies_linked_data/services/submission_process/operations/submission_rdf_generator.rb @@ -0,0 +1,452 @@ +module LinkedData + module Services + + class MissingLabelsHandler < OntologySubmissionProcess + + def process(logger, options = {}) + handle_missing_labels(options[:file_path], logger) + end + + private + def handle_missing_labels(file_path, logger) + callbacks = { + missing_labels: { + op_name: 'Missing Labels Generation', + required: true, + status: LinkedData::Models::SubmissionStatus.find('RDF_LABELS').first, + artifacts: { + file_path: file_path + }, + caller_on_pre: :generate_missing_labels_pre, + caller_on_pre_page: :generate_missing_labels_pre_page, + caller_on_each: :generate_missing_labels_each, + caller_on_post_page: :generate_missing_labels_post_page, + caller_on_post: :generate_missing_labels_post + } + } + + raw_paging = LinkedData::Models::Class.in(@submission).include(:prefLabel, :synonym, :label) + loop_classes(logger, raw_paging, @submission, callbacks) + end + + def process_callbacks(logger, callbacks, action_name) + callbacks.delete_if do |_, callback| + begin + if callback[action_name] + callable = self.method(callback[action_name]) + yield(callable, callback) + end + false + rescue Exception => e + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + + if callback[:status] + @submission.add_submission_status(callback[:status].get_error_status) + @submission.save + end + + # halt the entire processing if :required is set to true + raise e if callback[:required] + # continue processing of other callbacks, but not this one + true + end + end + end + + def loop_classes(logger, raw_paging, submission, callbacks) + page = 1 + size = 2500 + count_classes = 0 + acr = submission.id.to_s.split("/")[-1] + operations = callbacks.values.map { |v| v[:op_name] }.join(", ") + + time = Benchmark.realtime do + paging = raw_paging.page(page, size) + cls_count_set = false + cls_count = submission.class_count(logger) + + if cls_count > -1 + # prevent a COUNT SPARQL query if possible + paging.page_count_set(cls_count) + cls_count_set = true + else + cls_count = 0 + end + + iterate_classes = false + # 1. init artifacts hash if not explicitly passed in the callback + # 2. 
determine if class-level iteration is required + callbacks.each { |_, callback| callback[:artifacts] ||= {}; if callback[:caller_on_each] + iterate_classes = true + end } + + process_callbacks(logger, callbacks, :caller_on_pre) { + |callable, callback| callable.call(callback[:artifacts], logger, paging) } + + page_len = -1 + prev_page_len = -1 + + begin + t0 = Time.now + page_classes = paging.page(page, size).all + total_pages = page_classes.total_pages + page_len = page_classes.length + + # nothing retrieved even though we're expecting more records + if total_pages > 0 && page_classes.empty? && (prev_page_len == -1 || prev_page_len == size) + j = 0 + num_calls = LinkedData.settings.num_retries_4store + + while page_classes.empty? && j < num_calls do + j += 1 + logger.error("Empty page encountered. Retrying #{j} times...") + sleep(2) + page_classes = paging.page(page, size).all + unless page_classes.empty? + logger.info("Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") + end + end + + if page_classes.empty? + msg = "Empty page #{page} of #{total_pages} persisted after retrying #{j} times. #{operations} of #{acr} aborted..." + logger.error(msg) + raise msg + end + end + + if page_classes.empty? + if total_pages > 0 + logger.info("The number of pages reported for #{acr} - #{total_pages} is higher than expected #{page - 1}. Completing #{operations}...") + else + logger.info("Ontology #{acr} contains #{total_pages} pages...") + end + break + end + + prev_page_len = page_len + logger.info("#{acr}: page #{page} of #{total_pages} - #{page_len} ontology terms retrieved in #{Time.now - t0} sec.") + logger.flush + count_classes += page_classes.length + + process_callbacks(logger, callbacks, :caller_on_pre_page) { + |callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) } + + page_classes.each { |c| + process_callbacks(logger, callbacks, :caller_on_each) { + |callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page, c) } + } if iterate_classes + + process_callbacks(logger, callbacks, :caller_on_post_page) { + |callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) } + cls_count += page_classes.length unless cls_count_set + + page = page_classes.next? ? page + 1 : nil + end while !page.nil? + + callbacks.each { |_, callback| callback[:artifacts][:count_classes] = cls_count } + process_callbacks(logger, callbacks, :caller_on_post) { + |callable, callback| callable.call(callback[:artifacts], logger, paging) } + end + + logger.info("Completed #{operations}: #{acr} in #{time} sec. 
#{count_classes} classes.") + logger.flush + + # set the status on actions that have completed successfully + callbacks.each do |_, callback| + if callback[:status] + @submission.add_submission_status(callback[:status]) + @submission.save + end + end + end + + def generate_missing_labels_pre(artifacts = {}, logger, paging) + file_path = artifacts[:file_path] + artifacts[:save_in_file] = File.join(File.dirname(file_path), "labels.ttl") + artifacts[:save_in_file_mappings] = File.join(File.dirname(file_path), "mappings.ttl") + property_triples = LinkedData::Utils::Triples.rdf_for_custom_properties(@submission) + Goo.sparql_data_client.append_triples(@submission.id, property_triples, mime_type = "application/x-turtle") + fsave = File.open(artifacts[:save_in_file], "w") + fsave.write(property_triples) + fsave_mappings = File.open(artifacts[:save_in_file_mappings], "w") + artifacts[:fsave] = fsave + artifacts[:fsave_mappings] = fsave_mappings + end + + def generate_missing_labels_pre_page(artifacts={}, logger, paging, page_classes, page) + artifacts[:label_triples] = [] + artifacts[:mapping_triples] = [] + end + + def generate_missing_labels_each(artifacts = {}, logger, paging, page_classes, page, c) + prefLabel = nil + + if c.prefLabel.nil? + rdfs_labels = c.label + + if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0 + rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first + + rdfs_labels = c.label if rdfs_labels.nil? || rdfs_labels.length == 0 + end + + rdfs_labels = [rdfs_labels] if rdfs_labels and not (rdfs_labels.instance_of? Array) + label = nil + + if rdfs_labels && rdfs_labels.length > 0 + label = rdfs_labels[0] + else + label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s + end + artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple( + c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label) + prefLabel = label + else + prefLabel = c.prefLabel + end + + if @submission.ontology.viewOf.nil? 
+ loomLabel = LinkedData::Models::OntologySubmission.loom_transform_literal(prefLabel.to_s) + + if loomLabel.length > 2 + artifacts[:mapping_triples] << LinkedData::Utils::Triples.loom_mapping_triple( + c.id, Goo.vocabulary(:metadata_def)[:mappingLoom], loomLabel) + end + artifacts[:mapping_triples] << LinkedData::Utils::Triples.uri_mapping_triple( + c.id, Goo.vocabulary(:metadata_def)[:mappingSameURI], c.id) + end + end + + def generate_missing_labels_post_page(artifacts = {}, logger, paging, page_classes, page) + rest_mappings = LinkedData::Mappings.migrate_rest_mappings(@submission.ontology.acronym) + artifacts[:mapping_triples].concat(rest_mappings) + + if artifacts[:label_triples].length > 0 + logger.info("Asserting #{artifacts[:label_triples].length} labels in " + + "#{@submission.id.to_ntriples}") + logger.flush + artifacts[:label_triples] = artifacts[:label_triples].join("\n") + artifacts[:fsave].write(artifacts[:label_triples]) + t0 = Time.now + Goo.sparql_data_client.append_triples(@submission.id, artifacts[:label_triples], mime_type = "application/x-turtle") + t1 = Time.now + logger.info("Labels asserted in #{t1 - t0} sec.") + logger.flush + else + logger.info("No labels generated in page #{page}.") + logger.flush + end + + if artifacts[:mapping_triples].length > 0 + logger.info("Asserting #{artifacts[:mapping_triples].length} mappings in " + + "#{@submission.id.to_ntriples}") + logger.flush + artifacts[:mapping_triples] = artifacts[:mapping_triples].join("\n") + artifacts[:fsave_mappings].write(artifacts[:mapping_triples]) + + t0 = Time.now + Goo.sparql_data_client.append_triples(@submission.id, artifacts[:mapping_triples], mime_type = "application/x-turtle") + t1 = Time.now + logger.info("Mapping labels asserted in #{t1 - t0} sec.") + logger.flush + end + end + + def generate_missing_labels_post(artifacts = {}, logger, pagging) + logger.info("end generate_missing_labels traversed #{artifacts[:count_classes]} classes") + logger.info("Saved generated labels in #{artifacts[:save_in_file]}") + artifacts[:fsave].close() + artifacts[:fsave_mappings].close() + logger.flush + end + + end + + class SubmissionRDFGenerator < OntologySubmissionProcess + + def process(logger, options) + process_rdf(logger, options[:reasoning]) + end + + private + + def process_rdf(logger, reasoning) + # Remove processing status types before starting RDF parsing etc. + @submission.submissionStatus = nil + status = LinkedData::Models::SubmissionStatus.find('UPLOADED').first + @submission.add_submission_status(status) + @submission.save + + # Parse RDF + begin + unless @submission.valid? + error = 'Submission is not valid, it cannot be processed. Check errors.' + raise ArgumentError, error + end + unless @submission.uploadFilePath + error = 'Submission is missing an ontology file, cannot parse.' + raise ArgumentError, error + end + status = LinkedData::Models::SubmissionStatus.find('RDF').first + @submission.remove_submission_status(status) #remove RDF status before starting + zip_dst = @submission.unzip_submission(logger) + file_path = zip_dst ? 
zip_dst.to_s : @submission.uploadFilePath.to_s + generate_rdf(logger, file_path, reasoning: reasoning) + @submission.add_submission_status(status) + @submission.save + rescue Exception => e + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + @submission.add_submission_status(status.get_error_status) + @submission.save + # If RDF generation fails, no point of continuing + raise e + end + + MissingLabelsHandler.new(@submission).process(logger, file_path: file_path) + + status = LinkedData::Models::SubmissionStatus.find('OBSOLETE').first + begin + generate_obsolete_classes(logger, file_path) + @submission.add_submission_status(status) + @submission.save + rescue Exception => e + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + @submission.add_submission_status(status.get_error_status) + @submission.save + # if obsolete fails the parsing fails + raise e + end + end + + def generate_obsolete_classes(logger, file_path) + @submission.bring(:obsoleteProperty) if @submission.bring?(:obsoleteProperty) + @submission.bring(:obsoleteParent) if @submission.bring?(:obsoleteParent) + classes_deprecated = [] + if @submission.obsoleteProperty && + @submission.obsoleteProperty.to_s != "http://www.w3.org/2002/07/owl#deprecated" + + predicate_obsolete = RDF::URI.new(@submission.obsoleteProperty.to_s) + query_obsolete_predicate = < 0 + classes_deprecated.uniq! + logger.info("Asserting owl:deprecated statement for #{classes_deprecated} classes") + save_in_file = File.join(File.dirname(file_path), "obsolete.ttl") + fsave = File.open(save_in_file, "w") + classes_deprecated.each do |class_id| + fsave.write(LinkedData::Utils::Triples.obselete_class_triple(class_id) + "\n") + end + fsave.close() + result = Goo.sparql_data_client.append_triples_from_file( + @submission.id, + save_in_file, + mime_type = "application/x-turtle") + end + end + + def generate_rdf(logger, file_path, reasoning: true) + mime_type = nil + + if @submission.hasOntologyLanguage.umls? 
+ triples_file_path = @submission.triples_file_path + logger.info("Using UMLS turtle file found, skipping OWLAPI parse") + logger.flush + mime_type = LinkedData::MediaTypes.media_type_from_base(LinkedData::MediaTypes::TURTLE) + SubmissionMetricsCalculator.new(@submission).generate_umls_metrics_file(triples_file_path) + else + output_rdf = @submission.rdf_path + + if File.exist?(output_rdf) + logger.info("deleting old owlapi.xrdf ..") + deleted = FileUtils.rm(output_rdf) + + if deleted.length > 0 + logger.info("deleted") + else + logger.info("error deleting owlapi.rdf") + end + end + owlapi = LinkedData::Parser::OWLAPICommand.new( + File.expand_path(file_path), + File.expand_path(@submission.data_folder.to_s), + master_file: @submission.masterFileName) + + owlapi.disable_reasoner unless reasoning + triples_file_path, missing_imports = owlapi.parse + + if missing_imports && missing_imports.length > 0 + @submission.missingImports = missing_imports + + missing_imports.each do |imp| + logger.info("OWL_IMPORT_MISSING: #{imp}") + end + else + @submission.missingImports = nil + end + logger.flush + end + delete_and_append(triples_file_path, logger, mime_type) + version_info = extract_version() + + @submission.version = version_info if version_info + end + + def delete_and_append(triples_file_path, logger, mime_type = nil) + Goo.sparql_data_client.delete_graph(@submission.id) + Goo.sparql_data_client.put_triples(@submission.id, triples_file_path, mime_type) + logger.info("Triples #{triples_file_path} appended in #{@submission.id.to_ntriples}") + logger.flush + end + + def extract_version + + query_version_info = < + ?versionInfo . +} +eos + Goo.sparql_query_client.query(query_version_info).each_solution do |sol| + return sol[:versionInfo].to_s + end + nil + end + + end + end +end + From 1dbca522db263612b35830bf4b73f325d93b7267 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:51:49 +0200 Subject: [PATCH 07/14] adding the OntologySubmissionIndexer operation --- .../operations/submission_indexer.rb | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb diff --git a/lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb b/lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb new file mode 100644 index 00000000..8751c306 --- /dev/null +++ b/lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb @@ -0,0 +1,199 @@ +module LinkedData + module Services + class OntologySubmissionIndexer < OntologySubmissionProcess + + def process(logger, options = nil) + process_indexation(logger, options) + end + + private + + def process_indexation(logger, options) + + status = LinkedData::Models::SubmissionStatus.find('INDEXED').first + begin + index(logger, options[:index_commit], false) + @submission.add_submission_status(status) + rescue Exception => e + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + @submission.add_submission_status(status.get_error_status) + if File.file?(@submission.csv_path) + FileUtils.rm(@submission.csv_path) + end + ensure + @submission.save + end + end + + def index(logger, commit = true, optimize = true) + page = 0 + size = 1000 + count_classes = 0 + + time = Benchmark.realtime do + @submission.bring(:ontology) if @submission.bring?(:ontology) + @submission.ontology.bring(:acronym) if @submission.ontology.bring?(:acronym) 
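# What follows: classes are paged in batches of `size` across up to
# LinkedData.settings.indexing_num_threads threads; empty pages are retried up to
# LinkedData.settings.num_retries_4store times, every class is written to the
# submission CSV, and each page is pushed to the search index with
# LinkedData::Models::Class.indexBatch.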
+ @submission.ontology.bring(:provisionalClasses) if @submission.ontology.bring?(:provisionalClasses) + csv_writer = LinkedData::Utils::OntologyCSVWriter.new + csv_writer.open(@submission.ontology, @submission.csv_path) + + begin + logger.info("Indexing ontology terms: #{@submission.ontology.acronym}...") + t0 = Time.now + @submission.ontology.unindex(false) + logger.info("Removed ontology terms index (#{Time.now - t0}s)"); logger.flush + + paging = LinkedData::Models::Class.in(@submission).include(:unmapped).aggregate(:count, :children).page(page, size) + cls_count = @submission.class_count(logger) + paging.page_count_set(cls_count) unless cls_count < 0 + total_pages = paging.page(1, size).all.total_pages + num_threads = [total_pages, LinkedData.settings.indexing_num_threads].min + threads = [] + page_classes = nil + + num_threads.times do |num| + threads[num] = Thread.new { + Thread.current['done'] = false + Thread.current['page_len'] = -1 + Thread.current['prev_page_len'] = -1 + + while !Thread.current['done'] + @submission.synchronize do + page = (page == 0 || page_classes.next?) ? page + 1 : nil + + if page.nil? + Thread.current['done'] = true + else + Thread.current['page'] = page || 'nil' + page_classes = paging.page(page, size).all + count_classes += page_classes.length + Thread.current['page_classes'] = page_classes + Thread.current['page_len'] = page_classes.length + Thread.current['t0'] = Time.now + + # nothing retrieved even though we're expecting more records + if total_pages > 0 && page_classes.empty? && (Thread.current['prev_page_len'] == -1 || Thread.current['prev_page_len'] == size) + j = 0 + num_calls = LinkedData.settings.num_retries_4store + + while page_classes.empty? && j < num_calls do + j += 1 + logger.error("Thread #{num + 1}: Empty page encountered. Retrying #{j} times...") + sleep(2) + page_classes = paging.page(page, size).all + logger.info("Thread #{num + 1}: Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") unless page_classes.empty? + end + + if page_classes.empty? + msg = "Thread #{num + 1}: Empty page #{Thread.current["page"]} of #{total_pages} persisted after retrying #{j} times. Indexing of #{@submission.id.to_s} aborted..." + logger.error(msg) + raise msg + else + Thread.current['page_classes'] = page_classes + end + end + + if page_classes.empty? + if total_pages > 0 + logger.info("Thread #{num + 1}: The number of pages reported for #{@submission.id.to_s} - #{total_pages} is higher than expected #{page - 1}. Completing indexing...") + else + logger.info("Thread #{num + 1}: Ontology #{@submission.id.to_s} contains #{total_pages} pages...") + end + + break + end + + Thread.current['prev_page_len'] = Thread.current['page_len'] + end + end + + break if Thread.current['done'] + + logger.info("Thread #{num + 1}: Page #{Thread.current["page"]} of #{total_pages} - #{Thread.current["page_len"]} ontology terms retrieved in #{Time.now - Thread.current["t0"]} sec.") + Thread.current['t0'] = Time.now + + Thread.current['page_classes'].each do |c| + begin + # this cal is needed for indexing of properties + LinkedData::Models::Class.map_attributes(c, paging.equivalent_predicates) + rescue Exception => e + i = 0 + num_calls = LinkedData.settings.num_retries_4store + success = nil + + while success.nil? && i < num_calls do + i += 1 + logger.error("Thread #{num + 1}: Exception while mapping attributes for #{c.id.to_s}. 
Retrying #{i} times...") + sleep(2) + + begin + LinkedData::Models::Class.map_attributes(c, paging.equivalent_predicates) + logger.info("Thread #{num + 1}: Success mapping attributes for #{c.id.to_s} after retrying #{i} times...") + success = true + rescue Exception => e1 + success = nil + + if i == num_calls + logger.error("Thread #{num + 1}: Error mapping attributes for #{c.id.to_s}:") + logger.error("Thread #{num + 1}: #{e1.class}: #{e1.message} after retrying #{i} times...\n#{e1.backtrace.join("\n\t")}") + logger.flush + end + end + end + end + + @submission.synchronize do + csv_writer.write_class(c) + end + end + logger.info("Thread #{num + 1}: Page #{Thread.current["page"]} of #{total_pages} attributes mapped in #{Time.now - Thread.current["t0"]} sec.") + + Thread.current['t0'] = Time.now + LinkedData::Models::Class.indexBatch(Thread.current['page_classes']) + logger.info("Thread #{num + 1}: Page #{Thread.current["page"]} of #{total_pages} - #{Thread.current["page_len"]} ontology terms indexed in #{Time.now - Thread.current["t0"]} sec.") + logger.flush + end + } + end + + threads.map { |t| t.join } + csv_writer.close + + begin + # index provisional classes + @submission.ontology.provisionalClasses.each { |pc| pc.index } + rescue Exception => e + logger.error("Error while indexing provisional classes for ontology #{@submission.ontology.acronym}:") + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + end + + if commit + t0 = Time.now + LinkedData::Models::Class.indexCommit() + logger.info("Ontology terms index commit in #{Time.now - t0} sec.") + end + rescue StandardError => e + csv_writer.close + logger.error("\n\n#{e.class}: #{e.message}\n") + logger.error(e.backtrace) + raise e + end + end + logger.info("Completed indexing ontology terms: #{@submission.ontology.acronym} in #{time} sec. 
#{count_classes} classes.") + logger.flush + + if optimize + logger.info('Optimizing ontology terms index...') + time = Benchmark.realtime do + LinkedData::Models::Class.indexOptimize() + end + logger.info("Completed optimizing ontology terms index in #{time} sec.") + end + end + + end + end +end + From 7bea7e4447dafeaa963a436f62b75f0b2ae91eb6 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:52:15 +0200 Subject: [PATCH 08/14] adding the SubmissionPropertiesIndexer --- .../submission_properties_indexer.rb | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 lib/ontologies_linked_data/services/submission_process/operations/submission_properties_indexer.rb diff --git a/lib/ontologies_linked_data/services/submission_process/operations/submission_properties_indexer.rb b/lib/ontologies_linked_data/services/submission_process/operations/submission_properties_indexer.rb new file mode 100644 index 00000000..7c615d9a --- /dev/null +++ b/lib/ontologies_linked_data/services/submission_process/operations/submission_properties_indexer.rb @@ -0,0 +1,71 @@ +module LinkedData + module Services + class SubmissionPropertiesIndexer < OntologySubmissionProcess + + def process(logger, options = nil) + process_indexation(logger, options) + end + + private + + def process_indexation(logger, options) + status = LinkedData::Models::SubmissionStatus.find('INDEXED_PROPERTIES').first + begin + index_properties(logger, options[:index_commit], false) + @submission.add_submission_status(status) + rescue StandardError => e + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + @submission.add_submission_status(status.get_error_status) + ensure + @submission.save + end + end + + + def index_properties(logger, commit = true, optimize = true) + page = 1 + size = 2500 + count_props = 0 + + time = Benchmark.realtime do + @submission.bring(:ontology) if @submission.bring?(:ontology) + @submission.ontology.bring(:acronym) if @submission.ontology.bring?(:acronym) + logger.info("Indexing ontology properties: #{@submission.ontology.acronym}...") + t0 = Time.now + @submission.ontology.unindex_properties(commit) + logger.info("Removed ontology properties index in #{Time.now - t0} seconds."); logger.flush + + props = @submission.ontology.properties + count_props = props.length + total_pages = (count_props/size.to_f).ceil + logger.info("Indexing a total of #{total_pages} pages of #{size} properties each.") + + props.each_slice(size) do |prop_batch| + t = Time.now + LinkedData::Models::Class.indexBatch(prop_batch, :property) + logger.info("Page #{page} of ontology properties indexed in #{Time.now - t} seconds."); logger.flush + page += 1 + end + + if commit + t0 = Time.now + LinkedData::Models::Class.indexCommit(nil, :property) + logger.info("Ontology properties index commit in #{Time.now - t0} seconds.") + end + end + logger.info("Completed indexing ontology properties of #{@submission.ontology.acronym} in #{time} sec. 
Total of #{count_props} properties indexed.") + logger.flush + + if optimize + logger.info('Optimizing ontology properties index...') + time = Benchmark.realtime do + LinkedData::Models::Class.indexOptimize(nil, :property) + end + logger.info("Completed optimizing ontology properties index in #{time} seconds.") + end + end + end + end +end + From e658b66b8ca9ef18dc9490b93b9546dd77b969ff Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:53:16 +0200 Subject: [PATCH 09/14] adding the SubmissionMetricsCalculator operation --- lib/ontologies_linked_data/metrics/metrics.rb | 64 ++--------- .../submission_mertrics_calculator.rb | 102 ++++++++++++++++++ 2 files changed, 112 insertions(+), 54 deletions(-) create mode 100644 lib/ontologies_linked_data/services/submission_process/operations/submission_mertrics_calculator.rb diff --git a/lib/ontologies_linked_data/metrics/metrics.rb b/lib/ontologies_linked_data/metrics/metrics.rb index c74f9e2b..c042c7ca 100644 --- a/lib/ontologies_linked_data/metrics/metrics.rb +++ b/lib/ontologies_linked_data/metrics/metrics.rb @@ -2,50 +2,6 @@ module LinkedData module Metrics - def self.metrics_for_submission(submission, logger) - metrics = nil - logger.info("metrics_for_submission start") - logger.flush - begin - submission.bring(:submissionStatus) if submission.bring?(:submissionStatus) - cls_metrics = class_metrics(submission, logger) - logger.info("class_metrics finished") - logger.flush - metrics = LinkedData::Models::Metric.new - - cls_metrics.each do |k,v| - unless v.instance_of?(Integer) - begin - v = Integer(v) - rescue ArgumentError - v = 0 - rescue TypeError - v = 0 - end - end - metrics.send("#{k}=",v) - end - indiv_count = number_individuals(logger, submission) - metrics.individuals = indiv_count - logger.info("individuals finished") - logger.flush - prop_count = number_properties(logger, submission) - metrics.properties = prop_count - logger.info("properties finished") - logger.flush - # re-generate metrics file - submission.generate_metrics_file(cls_metrics[:classes], indiv_count, prop_count) - logger.info("generation of metrics file finished") - logger.flush - rescue Exception => e - logger.error(e.message) - logger.error(e) - logger.flush - metrics = nil - end - metrics - end - def self.class_metrics(submission, logger) t00 = Time.now submission.ontology.bring(:flat) if submission.ontology.bring?(:flat) @@ -97,7 +53,7 @@ def self.class_metrics(submission, logger) logger.flush children_counts = [] groupby_children.each do |cls,count| - unless cls.start_with?("http") + unless cls.start_with?('http') next end unless is_flat @@ -178,7 +134,7 @@ def self.number_individuals(logger, submission) else logger.info("Unable to find metrics in file for submission #{submission.id.to_s}. Performing a COUNT of type query to get the total individual count...") logger.flush - indiv_count = count_owl_type(submission.id, "NamedIndividual") + indiv_count = count_owl_type(submission.id, 'NamedIndividual') end indiv_count end @@ -192,8 +148,8 @@ def self.number_properties(logger, submission) else logger.info("Unable to find metrics in file for submission #{submission.id.to_s}. 
Performing a COUNT of type query to get the total property count...") logger.flush - prop_count = count_owl_type(submission.id, "DatatypeProperty") - prop_count += count_owl_type(submission.id, "ObjectProperty") + prop_count = count_owl_type(submission.id, 'DatatypeProperty') + prop_count += count_owl_type(submission.id, 'ObjectProperty') end prop_count end @@ -203,17 +159,17 @@ def self.hierarchy_depth?(graph,root,n,treeProp) hops = [] vars = [] n.times do |i| - hop = sTemplate.sub("children","?x#{i}") + hop = sTemplate.sub('children',"?x#{i}") if i == 0 - hop = hop.sub("parent", "<#{root.to_s}>") + hop = hop.sub('parent', "<#{root.to_s}>") else - hop = hop.sub("parent", "?x#{i-1}") + hop = hop.sub('parent', "?x#{i-1}") end hops << hop vars << "?x#{i}" end joins = hops.join(".\n") - vars = vars.join(" ") + vars = vars.join(' ') query = < { @@ -238,7 +194,7 @@ def self.hierarchy_depth?(graph,root,n,treeProp) def self.query_count_definitions(subId,defProps) propFilter = defProps.map { |x| "?p = <#{x.to_s}>" } - propFilter = propFilter.join " || " + propFilter = propFilter.join ' || ' query = <<-eos SELECT (count(DISTINCT ?s) as ?c) WHERE { GRAPH <#{subId.to_s}> { @@ -249,7 +205,7 @@ def self.query_count_definitions(subId,defProps) FILTER (?s != <#{Goo.namespaces[:owl][:Thing]}>) }} eos - query = query.sub("properties", propFilter) + query = query.sub('properties', propFilter) rs = Goo.sparql_query_client.query(query) rs.each do |sol| return sol[:c].object diff --git a/lib/ontologies_linked_data/services/submission_process/operations/submission_mertrics_calculator.rb b/lib/ontologies_linked_data/services/submission_process/operations/submission_mertrics_calculator.rb new file mode 100644 index 00000000..411c8194 --- /dev/null +++ b/lib/ontologies_linked_data/services/submission_process/operations/submission_mertrics_calculator.rb @@ -0,0 +1,102 @@ +module LinkedData + module Services + class SubmissionMetricsCalculator < OntologySubmissionProcess + def process(logger, options = nil) + process_metrics(logger) + end + + def generate_umls_metrics_file(tr_file_path=nil) + tr_file_path ||= @submission.triples_file_path + class_count = 0 + indiv_count = 0 + prop_count = 0 + + File.foreach(tr_file_path) do |line| + class_count += 1 if line =~ /owl:Class/ + indiv_count += 1 if line =~ /owl:NamedIndividual/ + prop_count += 1 if line =~ /owl:ObjectProperty/ + prop_count += 1 if line =~ /owl:DatatypeProperty/ + end + generate_metrics_file(class_count, indiv_count, prop_count) + end + + private + + def process_metrics(logger) + status = LinkedData::Models::SubmissionStatus.find('METRICS').first + begin + compute_metrics(logger) + @submission.add_submission_status(status) + rescue StandardError => e + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + @submission.metrics = nil + @submission.add_submission_status(status.get_error_status) + ensure + @submission.save + end + end + + def compute_metrics(logger) + metrics = metrics_for_submission(logger) + metrics.id = RDF::URI.new(@submission.id.to_s + '/metrics') + exist_metrics = LinkedData::Models::Metric.find(metrics.id).first + exist_metrics.delete if exist_metrics + metrics.save + @submission.metrics = metrics + @submission + end + + def metrics_for_submission(logger) + logger.info('metrics_for_submission start') + logger.flush + begin + @submission.bring(:submissionStatus) if @submission.bring?(:submissionStatus) + cls_metrics = LinkedData::Metrics.class_metrics(@submission, logger) + logger.info('class_metrics 
finished') + logger.flush + metrics = LinkedData::Models::Metric.new + + cls_metrics.each do |k,v| + unless v.instance_of?(Integer) + begin + v = Integer(v) + rescue ArgumentError + v = 0 + rescue TypeError + v = 0 + end + end + metrics.send("#{k}=",v) + end + indiv_count = LinkedData::Metrics.number_individuals(logger, @submission) + metrics.individuals = indiv_count + logger.info('individuals finished') + logger.flush + prop_count = LinkedData::Metrics.number_properties(logger, @submission) + metrics.properties = prop_count + logger.info('properties finished') + logger.flush + # re-generate metrics file + generate_metrics_file(cls_metrics[:classes], indiv_count, prop_count) + logger.info('generation of metrics file finished') + logger.flush + rescue StandardError => e + logger.error(e.message) + logger.error(e) + logger.flush + metrics = nil + end + metrics + end + + def generate_metrics_file(class_count, indiv_count, prop_count) + CSV.open(@submission.metrics_path, 'wb') do |csv| + csv << ['Class Count', 'Individual Count', 'Property Count'] + csv << [class_count, indiv_count, prop_count] + end + end + + end + end +end From 4e1aff5d6dfed93451b857a76d7d62007e8492ed Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 12:53:49 +0200 Subject: [PATCH 10/14] adding the SubmissionDiffGenerator operation --- .../operations/submission_diff_generator.rb | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 lib/ontologies_linked_data/services/submission_process/operations/submission_diff_generator.rb diff --git a/lib/ontologies_linked_data/services/submission_process/operations/submission_diff_generator.rb b/lib/ontologies_linked_data/services/submission_process/operations/submission_diff_generator.rb new file mode 100644 index 00000000..b6dda351 --- /dev/null +++ b/lib/ontologies_linked_data/services/submission_process/operations/submission_diff_generator.rb @@ -0,0 +1,86 @@ +module LinkedData + module Services + class SubmissionDiffGenerator < OntologySubmissionProcess + + def process(logger, options = nil) + process_diff(logger) + end + + def diff(logger, older) + generate_diff(logger, init_diff_tool(older)) + end + + private + + # accepts another submission in 'older' (it should be an 'older' ontology version) + def init_diff_tool(older) + @submission.bring(:uploadFilePath) + older.bring(:uploadFilePath) + + LinkedData::Diff::BubastisDiffCommand.new( + File.expand_path(older.uploadFilePath), + File.expand_path(@submission.uploadFilePath)) + end + + def process_diff(logger) + status = LinkedData::Models::SubmissionStatus.find('DIFF').first + # Get previous submission from ontology.submissions + @submission.ontology.bring(:submissions) + submissions = @submission.ontology.submissions + + if submissions.nil? 
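# Hedged usage sketch: a diff can also be generated directly against a chosen older
# submission (newer_submission and older_submission are assumed, already loaded instances):
#   LinkedData::Services::SubmissionDiffGenerator.new(newer_submission).diff(logger, older_submission)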
+ logger.info("Diff process: no submissions available for #{@submission.id}.") + else + submissions.each { |s| s.bring(:submissionId, :diffFilePath) } + # Sort submissions in descending order of submissionId, extract last two submissions + recent_submissions = submissions.sort { |a, b| b.submissionId <=> a.submissionId }[0..1] + + if recent_submissions.length > 1 + # validate that the most recent submission is the current submission + if @submission.submissionId == recent_submissions.first.submissionId + prev = recent_submissions.last + + # Ensure that prev is older than the current submission + if @submission.submissionId > prev.submissionId + # generate a diff + begin + diff(logger,prev) + @submission.add_submission_status(status) + rescue Exception => e + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + @submission.add_submission_status(status.get_error_status) + ensure + @submission.save + end + end + end + else + logger.info("Diff process: no older submissions available for #{@submission.id}.") + end + end + end + + + def generate_diff(logger, diff_tool) + begin + @submission.bring_remaining + @submission.bring(:diffFilePath) + + LinkedData::Diff.logger = logger + @submission.diffFilePath = diff_tool.diff + @submission.save + logger.info("Diff generated successfully for #{@submission.id}") + logger.flush + rescue StoreError => e + logger.error("Diff process for #{@submission.id} failed - #{e.class}: #{e.message}") + logger.flush + raise e + end + end + + end + end +end + + From 4b1e4ec2abfce6b87ccb24242a15db9cec97e361 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 13:56:11 +0200 Subject: [PATCH 11/14] add the submission processor operation --- .../submission_processor.rb | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 lib/ontologies_linked_data/services/submission_process/submission_processor.rb diff --git a/lib/ontologies_linked_data/services/submission_process/submission_processor.rb b/lib/ontologies_linked_data/services/submission_process/submission_processor.rb new file mode 100644 index 00000000..107f70cd --- /dev/null +++ b/lib/ontologies_linked_data/services/submission_process/submission_processor.rb @@ -0,0 +1,126 @@ +module LinkedData + module Services + class OntologyProcessor < OntologySubmissionProcess + + ################################################################ + # Possible options with their defaults: + # process_rdf = false + # index_search = false + # index_properties = false + # index_commit = false + # run_metrics = false + # reasoning = false + # diff = false + # archive = false + # if no options passed, ALL actions, except for archive = true + ################################################################ + def process(logger, options = nil) + process_submission(logger, options) + end + + private + + def process_submission(logger, options = {}) + # Wrap the whole process so we can email results + begin + archive, diff, index_commit, index_properties, + index_search, process_rdf, reasoning, run_metrics = get_options(options) + + @submission.bring_remaining + @submission.ontology.bring_remaining + + logger.info("Starting to process #{@submission.ontology.acronym}/submissions/#{@submission.submissionId}") + logger.flush + LinkedData::Parser.logger = logger + + if archive + @submission.archive + else + + @submission.generate_rdf(logger, reasoning: reasoning) if process_rdf + + parsed = @submission.ready?(status: [:rdf, :rdf_labels]) + + if index_search + unless parsed 
+ raise StandardError, "The submission #{@submission.ontology.acronym}/submissions/#{@submission.submissionId} + cannot be indexed because it has not been successfully parsed" + end + @submission.index(logger, commit: index_commit) + end + + if index_properties + unless parsed + raise Exception, "The properties for the submission #{@submission.ontology.acronym}/submissions/#{@submission.submissionId} + cannot be indexed because it has not been successfully parsed" + + end + @submission.index_properties(logger, commit: index_commit) + end + + if run_metrics + unless parsed + raise StandardError, "Metrics cannot be generated on the submission + #{@submission.ontology.acronym}/submissions/#{@submission.submissionId} + because it has not been successfully parsed" + end + @submission.generate_metrics(logger) + end + @submission.generate_diff(logger) if diff + end + + @submission.save + logger.info("Submission processing of #{@submission.id} completed successfully") + logger.flush + ensure + # make sure results get emailed + notify_submission_processed(logger) + end + @submission + end + + def notify_submission_processed(logger) + begin + LinkedData::Utils::Notifications.submission_processed(@submission) + rescue StandardError => e + logger.error("Email sending failed: #{e.message}\n#{e.backtrace.join("\n\t")}"); logger.flush + end + end + + def get_options(options) + + if options.empty? + process_rdf = true + index_search = true + index_properties = true + index_commit = true + run_metrics = true + reasoning = true + diff = true + archive = false + else + process_rdf = options[:process_rdf] == true + index_search = options[:index_search] == true + index_properties = options[:index_properties] == true + run_metrics = options[:run_metrics] == true + + reasoning = if !process_rdf || options[:reasoning] == false + false + else + true + end + + index_commit = if (!index_search && !index_properties) || options[:index_commit] == false + false + else + true + end + + diff = options[:diff] == true + archive = options[:archive] == true + end + [archive, diff, index_commit, index_properties, index_search, process_rdf, reasoning, run_metrics] + end + end + end +end From 0bce026171915bf251c0fd53b453e853956af156 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 13:57:06 +0200 Subject: [PATCH 12/14] move the submission operations to an external module --- lib/ontologies_linked_data.rb | 6 + .../models/concerns/submission_process.rb | 40 + .../models/ontology_submission.rb | 895 +----------------- 3 files changed, 49 insertions(+), 892 deletions(-) create mode 100644 lib/ontologies_linked_data/models/concerns/submission_process.rb diff --git a/lib/ontologies_linked_data.rb b/lib/ontologies_linked_data.rb index 92d51fcd..c935b54d 100644 --- a/lib/ontologies_linked_data.rb +++ b/lib/ontologies_linked_data.rb @@ -47,6 +47,12 @@ require m end +# We need to require deterministic - that is why we have the sort. +models = Dir.glob("#{project_root}/ontologies_linked_data/models/concerns//**/*.rb").sort +models.each do |m| + require m +end + # We need to require deterministic - that is why we have the sort. 
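# Hedged usage sketch of the processor added above: the option keys mirror those read in
# OntologyProcessor#get_options; the submission and logger objects are assumed to exist.
processor = LinkedData::Services::OntologyProcessor.new(submission)
processor.process(logger,
                  process_rdf: true, reasoning: true,
                  index_search: true, index_properties: true, index_commit: true,
                  run_metrics: true, diff: true, archive: false)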
models = Dir.glob("#{project_root}/ontologies_linked_data/models/**/*.rb").sort models.each do |m| diff --git a/lib/ontologies_linked_data/models/concerns/submission_process.rb b/lib/ontologies_linked_data/models/concerns/submission_process.rb new file mode 100644 index 00000000..fa19df40 --- /dev/null +++ b/lib/ontologies_linked_data/models/concerns/submission_process.rb @@ -0,0 +1,40 @@ +module LinkedData + module Concerns + module SubmissionProcessable + + def process_submission(logger, options={}) + LinkedData::Services::OntologyProcessor.new(self).process(logger, options) + end + + def diff(logger, older) + LinkedData::Services::SubmissionDiffGenerator.new(self).diff(logger, older) + end + + def generate_diff(logger) + LinkedData::Services::SubmissionDiffGenerator.new(self).process(logger) + end + + def index(logger, commit: true, optimize: true) + LinkedData::Services::OntologySubmissionIndexer.new(self).process(logger, commit: commit, optimize: optimize) + end + + def index_properties(logger, commit: true, optimize: true) + LinkedData::Services::SubmissionPropertiesIndexer.new(self).process(logger, commit: commit, optimize: optimize) + end + + def archive + LinkedData::Services::OntologySubmissionArchiver.new(self ).process + end + + def generate_rdf(logger, reasoning: true) + LinkedData::Services::SubmissionRDFGenerator.new(self).process(logger, reasoning: reasoning) + end + + def generate_metrics(logger) + LinkedData::Services::SubmissionMetricsCalculator.new(self).process(logger) + end + + end + end +end + diff --git a/lib/ontologies_linked_data/models/ontology_submission.rb b/lib/ontologies_linked_data/models/ontology_submission.rb index 050958dd..76f957ee 100644 --- a/lib/ontologies_linked_data/models/ontology_submission.rb +++ b/lib/ontologies_linked_data/models/ontology_submission.rb @@ -12,7 +12,8 @@ module Models class OntologySubmission < LinkedData::Models::Base - FILES_TO_DELETE = ['labels.ttl', 'mappings.ttl', 'obsolete.ttl', 'owlapi.xrdf', 'errors.log'] + include LinkedData::Concerns::SubmissionProcessable + FLAT_ROOTS_LIMIT = 1000 model :ontology_submission, name_with: lambda { |s| submission_id_generator(s) } @@ -317,36 +318,6 @@ def unzip_submission(logger) return zip_dst end - def delete_old_submission_files - path_to_repo = data_folder - submission_files = FILES_TO_DELETE.map { |f| File.join(path_to_repo, f) } - submission_files.push(csv_path) - submission_files.push(parsing_log_path) unless parsing_log_path.nil? 
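# Hedged sketch of the facade introduced by this patch: OntologySubmission now delegates
# each processing step to the service classes from the previous patches
# (submission, logger and older are assumed objects).
submission.process_submission(logger, process_rdf: true, index_search: true, diff: true)
submission.generate_rdf(logger, reasoning: true)  # SubmissionRDFGenerator
submission.archive                                # OntologySubmissionArchiver
submission.diff(logger, older)                    # SubmissionDiffGenerator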
- FileUtils.rm(submission_files, force: true) - end - - # accepts another submission in 'older' (it should be an 'older' ontology version) - def diff(logger, older) - begin - self.bring_remaining - self.bring(:diffFilePath) - self.bring(:uploadFilePath) - older.bring(:uploadFilePath) - LinkedData::Diff.logger = logger - bubastis = LinkedData::Diff::BubastisDiffCommand.new( - File.expand_path(older.uploadFilePath), - File.expand_path(self.uploadFilePath) - ) - self.diffFilePath = bubastis.diff - self.save - logger.info("Bubastis diff generated successfully for #{self.id}") - logger.flush - rescue Exception => e - logger.error("Bubastis diff for #{self.id} failed - #{e.class}: #{e.message}") - logger.flush - raise e - end - end def class_count(logger=nil) logger ||= LinkedData::Parser.logger || Logger.new($stderr) @@ -407,385 +378,6 @@ def metrics_from_file(logger=nil) metrics end - def generate_metrics_file(class_count, indiv_count, prop_count) - CSV.open(self.metrics_path, "wb") do |csv| - csv << ["Class Count", "Individual Count", "Property Count"] - csv << [class_count, indiv_count, prop_count] - end - end - - def generate_umls_metrics_file(tr_file_path=nil) - tr_file_path ||= self.triples_file_path - class_count = 0 - indiv_count = 0 - prop_count = 0 - - File.foreach(tr_file_path) do |line| - class_count += 1 if line =~ /owl:Class/ - indiv_count += 1 if line =~ /owl:NamedIndividual/ - prop_count += 1 if line =~ /owl:ObjectProperty/ - prop_count += 1 if line =~ /owl:DatatypeProperty/ - end - self.generate_metrics_file(class_count, indiv_count, prop_count) - end - - def generate_rdf(logger, file_path, reasoning=true) - mime_type = nil - - if self.hasOntologyLanguage.umls? - triples_file_path = self.triples_file_path - logger.info("Using UMLS turtle file found, skipping OWLAPI parse") - logger.flush - mime_type = LinkedData::MediaTypes.media_type_from_base(LinkedData::MediaTypes::TURTLE) - generate_umls_metrics_file(triples_file_path) - else - output_rdf = self.rdf_path - - if File.exist?(output_rdf) - logger.info("deleting old owlapi.xrdf ..") - deleted = FileUtils.rm(output_rdf) - - if deleted.length > 0 - logger.info("deleted") - else - logger.info("error deleting owlapi.rdf") - end - end - owlapi = LinkedData::Parser::OWLAPICommand.new( - File.expand_path(file_path), - File.expand_path(self.data_folder.to_s), - master_file: self.masterFileName) - - if !reasoning - owlapi.disable_reasoner - end - triples_file_path, missing_imports = owlapi.parse - - if missing_imports && missing_imports.length > 0 - self.missingImports = missing_imports - - missing_imports.each do |imp| - logger.info("OWL_IMPORT_MISSING: #{imp}") - end - else - self.missingImports = nil - end - logger.flush - end - delete_and_append(triples_file_path, logger, mime_type) - version_info = extract_version() - - if version_info - self.version = version_info - end - end - - def extract_version - - query_version_info = < - ?versionInfo . 
-} -eos - Goo.sparql_query_client.query(query_version_info).each_solution do |sol| - return sol[:versionInfo].to_s - end - return nil - end - - def process_callbacks(logger, callbacks, action_name, &block) - callbacks.delete_if do |_, callback| - begin - if callback[action_name] - callable = self.method(callback[action_name]) - yield(callable, callback) - end - false - rescue Exception => e - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - - if callback[:status] - add_submission_status(callback[:status].get_error_status) - self.save - end - - # halt the entire processing if :required is set to true - raise e if callback[:required] - # continue processing of other callbacks, but not this one - true - end - end - end - - def loop_classes(logger, raw_paging, callbacks) - page = 1 - size = 2500 - count_classes = 0 - acr = self.id.to_s.split("/")[-1] - operations = callbacks.values.map { |v| v[:op_name] }.join(", ") - - time = Benchmark.realtime do - paging = raw_paging.page(page, size) - cls_count_set = false - cls_count = class_count(logger) - - if cls_count > -1 - # prevent a COUNT SPARQL query if possible - paging.page_count_set(cls_count) - cls_count_set = true - else - cls_count = 0 - end - - iterate_classes = false - # 1. init artifacts hash if not explicitly passed in the callback - # 2. determine if class-level iteration is required - callbacks.each { |_, callback| callback[:artifacts] ||= {}; iterate_classes = true if callback[:caller_on_each] } - - process_callbacks(logger, callbacks, :caller_on_pre) { - |callable, callback| callable.call(callback[:artifacts], logger, paging) } - - page_len = -1 - prev_page_len = -1 - - begin - t0 = Time.now - page_classes = paging.page(page, size).all - total_pages = page_classes.total_pages - page_len = page_classes.length - - # nothing retrieved even though we're expecting more records - if total_pages > 0 && page_classes.empty? && (prev_page_len == -1 || prev_page_len == size) - j = 0 - num_calls = LinkedData.settings.num_retries_4store - - while page_classes.empty? && j < num_calls do - j += 1 - logger.error("Empty page encountered. Retrying #{j} times...") - sleep(2) - page_classes = paging.page(page, size).all - logger.info("Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") unless page_classes.empty? - end - - if page_classes.empty? - msg = "Empty page #{page} of #{total_pages} persisted after retrying #{j} times. #{operations} of #{acr} aborted..." - logger.error(msg) - raise msg - end - end - - if page_classes.empty? - if total_pages > 0 - logger.info("The number of pages reported for #{acr} - #{total_pages} is higher than expected #{page - 1}. 
Completing #{operations}...") - else - logger.info("Ontology #{acr} contains #{total_pages} pages...") - end - break - end - - prev_page_len = page_len - logger.info("#{acr}: page #{page} of #{total_pages} - #{page_len} ontology terms retrieved in #{Time.now - t0} sec.") - logger.flush - count_classes += page_classes.length - - process_callbacks(logger, callbacks, :caller_on_pre_page) { - |callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) } - - page_classes.each { |c| - process_callbacks(logger, callbacks, :caller_on_each) { - |callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page, c) } - } if iterate_classes - - process_callbacks(logger, callbacks, :caller_on_post_page) { - |callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) } - cls_count += page_classes.length unless cls_count_set - - page = page_classes.next? ? page + 1 : nil - end while !page.nil? - - callbacks.each { |_, callback| callback[:artifacts][:count_classes] = cls_count } - process_callbacks(logger, callbacks, :caller_on_post) { - |callable, callback| callable.call(callback[:artifacts], logger, paging) } - end - - logger.info("Completed #{operations}: #{acr} in #{time} sec. #{count_classes} classes.") - logger.flush - - # set the status on actions that have completed successfully - callbacks.each do |_, callback| - if callback[:status] - add_submission_status(callback[:status]) - self.save - end - end - end - - def generate_missing_labels_pre(artifacts={}, logger, paging) - file_path = artifacts[:file_path] - artifacts[:save_in_file] = File.join(File.dirname(file_path), "labels.ttl") - artifacts[:save_in_file_mappings] = File.join(File.dirname(file_path), "mappings.ttl") - property_triples = LinkedData::Utils::Triples.rdf_for_custom_properties(self) - Goo.sparql_data_client.append_triples(self.id, property_triples, mime_type="application/x-turtle") - fsave = File.open(artifacts[:save_in_file], "w") - fsave.write(property_triples) - fsave_mappings = File.open(artifacts[:save_in_file_mappings], "w") - artifacts[:fsave] = fsave - artifacts[:fsave_mappings] = fsave_mappings - end - - def generate_missing_labels_pre_page(artifacts={}, logger, paging, page_classes, page) - artifacts[:label_triples] = [] - artifacts[:mapping_triples] = [] - end - - def generate_missing_labels_each(artifacts={}, logger, paging, page_classes, page, c) - prefLabel = nil - - if c.prefLabel.nil? - rdfs_labels = c.label - - if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0 - rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first - - if rdfs_labels.nil? || rdfs_labels.length == 0 - rdfs_labels = c.label - end - end - - if rdfs_labels and not (rdfs_labels.instance_of? Array) - rdfs_labels = [rdfs_labels] - end - label = nil - - if rdfs_labels && rdfs_labels.length > 0 - label = rdfs_labels[0] - else - label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s - end - artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple( - c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label) - prefLabel = label - else - prefLabel = c.prefLabel - end - - if self.ontology.viewOf.nil? 
- loomLabel = OntologySubmission.loom_transform_literal(prefLabel.to_s) - - if loomLabel.length > 2 - artifacts[:mapping_triples] << LinkedData::Utils::Triples.loom_mapping_triple( - c.id, Goo.vocabulary(:metadata_def)[:mappingLoom], loomLabel) - end - artifacts[:mapping_triples] << LinkedData::Utils::Triples.uri_mapping_triple( - c.id, Goo.vocabulary(:metadata_def)[:mappingSameURI], c.id) - end - end - - def generate_missing_labels_post_page(artifacts={}, logger, paging, page_classes, page) - rest_mappings = LinkedData::Mappings.migrate_rest_mappings(self.ontology.acronym) - artifacts[:mapping_triples].concat(rest_mappings) - - if artifacts[:label_triples].length > 0 - logger.info("Asserting #{artifacts[:label_triples].length} labels in " + - "#{self.id.to_ntriples}") - logger.flush - artifacts[:label_triples] = artifacts[:label_triples].join("\n") - artifacts[:fsave].write(artifacts[:label_triples]) - t0 = Time.now - Goo.sparql_data_client.append_triples(self.id, artifacts[:label_triples], mime_type="application/x-turtle") - t1 = Time.now - logger.info("Labels asserted in #{t1 - t0} sec.") - logger.flush - else - logger.info("No labels generated in page #{page}.") - logger.flush - end - - if artifacts[:mapping_triples].length > 0 - logger.info("Asserting #{artifacts[:mapping_triples].length} mappings in " + - "#{self.id.to_ntriples}") - logger.flush - artifacts[:mapping_triples] = artifacts[:mapping_triples].join("\n") - artifacts[:fsave_mappings].write(artifacts[:mapping_triples]) - - t0 = Time.now - Goo.sparql_data_client.append_triples(self.id, artifacts[:mapping_triples], mime_type="application/x-turtle") - t1 = Time.now - logger.info("Mapping labels asserted in #{t1 - t0} sec.") - logger.flush - end - end - - def generate_missing_labels_post(artifacts={}, logger, paging) - logger.info("end generate_missing_labels traversed #{artifacts[:count_classes]} classes") - logger.info("Saved generated labels in #{artifacts[:save_in_file]}") - artifacts[:fsave].close() - artifacts[:fsave_mappings].close() - logger.flush - end - - def generate_obsolete_classes(logger, file_path) - self.bring(:obsoleteProperty) if self.bring?(:obsoleteProperty) - self.bring(:obsoleteParent) if self.bring?(:obsoleteParent) - classes_deprecated = [] - if self.obsoleteProperty && - self.obsoleteProperty.to_s != "http://www.w3.org/2002/07/owl#deprecated" - - predicate_obsolete = RDF::URI.new(self.obsoleteProperty.to_s) - query_obsolete_predicate = < 0 - classes_deprecated.uniq! - logger.info("Asserting owl:deprecated statement for #{classes_deprecated} classes") - save_in_file = File.join(File.dirname(file_path), "obsolete.ttl") - fsave = File.open(save_in_file,"w") - classes_deprecated.each do |class_id| - fsave.write(LinkedData::Utils::Triples.obselete_class_triple(class_id) + "\n") - end - fsave.close() - result = Goo.sparql_data_client.append_triples_from_file( - self.id, - save_in_file, - mime_type="application/x-turtle") - end - end def add_submission_status(status) valid = status.is_a?(LinkedData::Models::SubmissionStatus) @@ -867,480 +459,6 @@ def archived? 
return ready?(status: [:archived]) end - ################################################################ - # Possible options with their defaults: - # process_rdf = false - # index_search = false - # index_properties = false - # index_commit = false - # run_metrics = false - # reasoning = false - # diff = false - # archive = false - # if no options passed, ALL actions, except for archive = true - ################################################################ - def process_submission(logger, options={}) - # Wrap the whole process so we can email results - begin - process_rdf = false - index_search = false - index_properties = false - index_commit = false - run_metrics = false - reasoning = false - diff = false - archive = false - - if options.empty? - process_rdf = true - index_search = true - index_properties = true - index_commit = true - run_metrics = true - reasoning = true - diff = true - archive = false - else - process_rdf = options[:process_rdf] == true ? true : false - index_search = options[:index_search] == true ? true : false - index_properties = options[:index_properties] == true ? true : false - index_commit = options[:index_commit] == true ? true : false - run_metrics = options[:run_metrics] == true ? true : false - - if !process_rdf || options[:reasoning] == false - reasoning = false - else - reasoning = true - end - - if (!index_search && !index_properties) || options[:index_commit] == false - index_commit = false - else - index_commit = true - end - - diff = options[:diff] == true ? true : false - archive = options[:archive] == true ? true : false - end - - self.bring_remaining - self.ontology.bring_remaining - - logger.info("Starting to process #{self.ontology.acronym}/submissions/#{self.submissionId}") - logger.flush - LinkedData::Parser.logger = logger - status = nil - - if archive - self.submissionStatus = nil - status = LinkedData::Models::SubmissionStatus.find("ARCHIVED").first - add_submission_status(status) - - # Delete everything except for original ontology file. - ontology.bring(:submissions) - submissions = ontology.submissions - unless submissions.nil? - submissions.each { |s| s.bring(:submissionId) } - submission = submissions.sort { |a,b| b.submissionId <=> a.submissionId }[0] - # Don't perform deletion if this is the most recent submission. - if (self.submissionId < submission.submissionId) - delete_old_submission_files - end - end - else - if process_rdf - # Remove processing status types before starting RDF parsing etc. - self.submissionStatus = nil - status = LinkedData::Models::SubmissionStatus.find("UPLOADED").first - add_submission_status(status) - self.save - - # Parse RDF - file_path = nil - begin - if not self.valid? - error = "Submission is not valid, it cannot be processed. Check errors." - raise ArgumentError, error - end - if not self.uploadFilePath - error = "Submission is missing an ontology file, cannot parse." - raise ArgumentError, error - end - status = LinkedData::Models::SubmissionStatus.find("RDF").first - remove_submission_status(status) #remove RDF status before starting - zip_dst = unzip_submission(logger) - file_path = zip_dst ? 
zip_dst.to_s : self.uploadFilePath.to_s - generate_rdf(logger, file_path, reasoning=reasoning) - add_submission_status(status) - self.save - rescue Exception => e - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - add_submission_status(status.get_error_status) - self.save - # If RDF generation fails, no point of continuing - raise e - end - - callbacks = { - missing_labels: { - op_name: "Missing Labels Generation", - required: true, - status: LinkedData::Models::SubmissionStatus.find("RDF_LABELS").first, - artifacts: { - file_path: file_path - }, - caller_on_pre: :generate_missing_labels_pre, - caller_on_pre_page: :generate_missing_labels_pre_page, - caller_on_each: :generate_missing_labels_each, - caller_on_post_page: :generate_missing_labels_post_page, - caller_on_post: :generate_missing_labels_post - } - } - - raw_paging = LinkedData::Models::Class.in(self).include(:prefLabel, :synonym, :label) - loop_classes(logger, raw_paging, callbacks) - - status = LinkedData::Models::SubmissionStatus.find("OBSOLETE").first - begin - generate_obsolete_classes(logger, file_path) - add_submission_status(status) - self.save - rescue Exception => e - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - add_submission_status(status.get_error_status) - self.save - # if obsolete fails the parsing fails - raise e - end - end - - parsed = ready?(status: [:rdf, :rdf_labels]) - - if index_search - raise Exception, "The submission #{self.ontology.acronym}/submissions/#{self.submissionId} cannot be indexed because it has not been successfully parsed" unless parsed - status = LinkedData::Models::SubmissionStatus.find("INDEXED").first - begin - index(logger, index_commit, false) - add_submission_status(status) - rescue Exception => e - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - add_submission_status(status.get_error_status) - if File.file?(self.csv_path) - FileUtils.rm(self.csv_path) - end - ensure - self.save - end - end - - if index_properties - raise Exception, "The properties for the submission #{self.ontology.acronym}/submissions/#{self.submissionId} cannot be indexed because it has not been successfully parsed" unless parsed - status = LinkedData::Models::SubmissionStatus.find("INDEXED_PROPERTIES").first - begin - index_properties(logger, index_commit, false) - add_submission_status(status) - rescue Exception => e - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - add_submission_status(status.get_error_status) - ensure - self.save - end - end - - if run_metrics - raise Exception, "Metrics cannot be generated on the submission #{self.ontology.acronym}/submissions/#{self.submissionId} because it has not been successfully parsed" unless parsed - status = LinkedData::Models::SubmissionStatus.find("METRICS").first - begin - process_metrics(logger) - add_submission_status(status) - rescue Exception => e - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - self.metrics = nil - add_submission_status(status.get_error_status) - ensure - self.save - end - end - - if diff - status = LinkedData::Models::SubmissionStatus.find("DIFF").first - # Get previous submission from ontology.submissions - self.ontology.bring(:submissions) - submissions = self.ontology.submissions - - unless submissions.nil? 
- submissions.each {|s| s.bring(:submissionId, :diffFilePath)} - # Sort submissions in descending order of submissionId, extract last two submissions - recent_submissions = submissions.sort {|a, b| b.submissionId <=> a.submissionId}[0..1] - - if recent_submissions.length > 1 - # validate that the most recent submission is the current submission - if self.submissionId == recent_submissions.first.submissionId - prev = recent_submissions.last - - # Ensure that prev is older than the current submission - if self.submissionId > prev.submissionId - # generate a diff - begin - self.diff(logger, prev) - add_submission_status(status) - rescue Exception => e - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - add_submission_status(status.get_error_status) - ensure - self.save - end - end - end - else - logger.info("Bubastis diff: no older submissions available for #{self.id}.") - end - else - logger.info("Bubastis diff: no submissions available for #{self.id}.") - end - end - end - - self.save - logger.info("Submission processing of #{self.id} completed successfully") - logger.flush - ensure - # make sure results get emailed - begin - LinkedData::Utils::Notifications.submission_processed(self) - rescue Exception => e - logger.error("Email sending failed: #{e.message}\n#{e.backtrace.join("\n\t")}"); logger.flush - end - end - self - end - - def process_metrics(logger) - metrics = LinkedData::Metrics.metrics_for_submission(self, logger) - metrics.id = RDF::URI.new(self.id.to_s + "/metrics") - exist_metrics = LinkedData::Models::Metric.find(metrics.id).first - exist_metrics.delete if exist_metrics - metrics.save - self.metrics = metrics - self - end - - def index(logger, commit = true, optimize = true) - page = 0 - size = 1000 - count_classes = 0 - - time = Benchmark.realtime do - self.bring(:ontology) if self.bring?(:ontology) - self.ontology.bring(:acronym) if self.ontology.bring?(:acronym) - self.ontology.bring(:provisionalClasses) if self.ontology.bring?(:provisionalClasses) - csv_writer = LinkedData::Utils::OntologyCSVWriter.new - csv_writer.open(self.ontology, self.csv_path) - - begin - logger.info("Indexing ontology terms: #{self.ontology.acronym}...") - t0 = Time.now - self.ontology.unindex(false) - logger.info("Removed ontology terms index (#{Time.now - t0}s)"); logger.flush - - paging = LinkedData::Models::Class.in(self).include(:unmapped).aggregate(:count, :children).page(page, size) - cls_count = class_count(logger) - paging.page_count_set(cls_count) unless cls_count < 0 - total_pages = paging.page(1, size).all.total_pages - num_threads = [total_pages, LinkedData.settings.indexing_num_threads].min - threads = [] - page_classes = nil - - num_threads.times do |num| - threads[num] = Thread.new { - Thread.current["done"] = false - Thread.current["page_len"] = -1 - Thread.current["prev_page_len"] = -1 - - while !Thread.current["done"] - synchronize do - page = (page == 0 || page_classes.next?) ? page + 1 : nil - - if page.nil? - Thread.current["done"] = true - else - Thread.current["page"] = page || "nil" - page_classes = paging.page(page, size).all - count_classes += page_classes.length - Thread.current["page_classes"] = page_classes - Thread.current["page_len"] = page_classes.length - Thread.current["t0"] = Time.now - - # nothing retrieved even though we're expecting more records - if total_pages > 0 && page_classes.empty? 
&& (Thread.current["prev_page_len"] == -1 || Thread.current["prev_page_len"] == size) - j = 0 - num_calls = LinkedData.settings.num_retries_4store - - while page_classes.empty? && j < num_calls do - j += 1 - logger.error("Thread #{num + 1}: Empty page encountered. Retrying #{j} times...") - sleep(2) - page_classes = paging.page(page, size).all - logger.info("Thread #{num + 1}: Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") unless page_classes.empty? - end - - if page_classes.empty? - msg = "Thread #{num + 1}: Empty page #{Thread.current["page"]} of #{total_pages} persisted after retrying #{j} times. Indexing of #{self.id.to_s} aborted..." - logger.error(msg) - raise msg - else - Thread.current["page_classes"] = page_classes - end - end - - if page_classes.empty? - if total_pages > 0 - logger.info("Thread #{num + 1}: The number of pages reported for #{self.id.to_s} - #{total_pages} is higher than expected #{page - 1}. Completing indexing...") - else - logger.info("Thread #{num + 1}: Ontology #{self.id.to_s} contains #{total_pages} pages...") - end - - break - end - - Thread.current["prev_page_len"] = Thread.current["page_len"] - end - end - - break if Thread.current["done"] - - logger.info("Thread #{num + 1}: Page #{Thread.current["page"]} of #{total_pages} - #{Thread.current["page_len"]} ontology terms retrieved in #{Time.now - Thread.current["t0"]} sec.") - Thread.current["t0"] = Time.now - - Thread.current["page_classes"].each do |c| - begin - # this cal is needed for indexing of properties - LinkedData::Models::Class.map_attributes(c, paging.equivalent_predicates) - rescue Exception => e - i = 0 - num_calls = LinkedData.settings.num_retries_4store - success = nil - - while success.nil? && i < num_calls do - i += 1 - logger.error("Thread #{num + 1}: Exception while mapping attributes for #{c.id.to_s}. 
Retrying #{i} times...") - sleep(2) - - begin - LinkedData::Models::Class.map_attributes(c, paging.equivalent_predicates) - logger.info("Thread #{num + 1}: Success mapping attributes for #{c.id.to_s} after retrying #{i} times...") - success = true - rescue Exception => e1 - success = nil - - if i == num_calls - logger.error("Thread #{num + 1}: Error mapping attributes for #{c.id.to_s}:") - logger.error("Thread #{num + 1}: #{e1.class}: #{e1.message} after retrying #{i} times...\n#{e1.backtrace.join("\n\t")}") - logger.flush - end - end - end - end - - synchronize do - csv_writer.write_class(c) - end - end - logger.info("Thread #{num + 1}: Page #{Thread.current["page"]} of #{total_pages} attributes mapped in #{Time.now - Thread.current["t0"]} sec.") - - Thread.current["t0"] = Time.now - LinkedData::Models::Class.indexBatch(Thread.current["page_classes"]) - logger.info("Thread #{num + 1}: Page #{Thread.current["page"]} of #{total_pages} - #{Thread.current["page_len"]} ontology terms indexed in #{Time.now - Thread.current["t0"]} sec.") - logger.flush - end - } - end - - threads.map { |t| t.join } - csv_writer.close - - begin - # index provisional classes - self.ontology.provisionalClasses.each { |pc| pc.index } - rescue Exception => e - logger.error("Error while indexing provisional classes for ontology #{self.ontology.acronym}:") - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - end - - if commit - t0 = Time.now - LinkedData::Models::Class.indexCommit() - logger.info("Ontology terms index commit in #{Time.now - t0} sec.") - end - rescue StandardError => e - csv_writer.close - logger.error("\n\n#{e.class}: #{e.message}\n") - logger.error(e.backtrace) - raise e - end - end - logger.info("Completed indexing ontology terms: #{self.ontology.acronym} in #{time} sec. #{count_classes} classes.") - logger.flush - - if optimize - logger.info("Optimizing ontology terms index...") - time = Benchmark.realtime do - LinkedData::Models::Class.indexOptimize() - end - logger.info("Completed optimizing ontology terms index in #{time} sec.") - end - end - - def index_properties(logger, commit = true, optimize = true) - page = 1 - size = 2500 - count_props = 0 - - time = Benchmark.realtime do - self.bring(:ontology) if self.bring?(:ontology) - self.ontology.bring(:acronym) if self.ontology.bring?(:acronym) - logger.info("Indexing ontology properties: #{self.ontology.acronym}...") - t0 = Time.now - self.ontology.unindex_properties(commit) - logger.info("Removed ontology properties index in #{Time.now - t0} seconds."); logger.flush - - props = self.ontology.properties - count_props = props.length - total_pages = (count_props/size.to_f).ceil - logger.info("Indexing a total of #{total_pages} pages of #{size} properties each.") - - props.each_slice(size) do |prop_batch| - t = Time.now - LinkedData::Models::Class.indexBatch(prop_batch, :property) - logger.info("Page #{page} of ontology properties indexed in #{Time.now - t} seconds."); logger.flush - page += 1 - end - - if commit - t0 = Time.now - LinkedData::Models::Class.indexCommit(nil, :property) - logger.info("Ontology properties index commit in #{Time.now - t0} seconds.") - end - end - logger.info("Completed indexing ontology properties of #{self.ontology.acronym} in #{time} sec. 
Total of #{count_props} properties indexed.") - logger.flush - - if optimize - logger.info("Optimizing ontology properties index...") - time = Benchmark.realtime do - LinkedData::Models::Class.indexOptimize(nil, :property) - end - logger.info("Completed optimizing ontology properties index in #{time} seconds.") - end - end - # Override delete to add removal from the search index #TODO: revise this with a better process def delete(*args) @@ -1361,7 +479,7 @@ def delete(*args) self.ontology.bring(:submissions) if self.ontology.submissions.length > 0 - prev_sub = self.ontology.latest_submission() + prev_sub = self.ontology.latest_submission if prev_sub prev_sub.index(LinkedData::Parser.logger || Logger.new($stderr)) @@ -1531,13 +649,6 @@ def delete_classes_graph private - def delete_and_append(triples_file_path, logger, mime_type = nil) - Goo.sparql_data_client.delete_graph(self.id) - Goo.sparql_data_client.put_triples(self.id, triples_file_path, mime_type) - logger.info("Triples #{triples_file_path} appended in #{self.id.to_ntriples}") - logger.flush - end - def check_http_file(url) session = Net::HTTP.new(url.host, url.port) session.use_ssl = true if url.port == 443 From 85a25e4f90e3fd55baf42536c2595b4aa5d7394f Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 4 Apr 2022 14:02:14 +0200 Subject: [PATCH 13/14] change the index argument name from 'index_commit' to just 'commit' --- .../submission_process/operations/submission_indexer.rb | 2 +- .../operations/submission_properties_indexer.rb | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb b/lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb index 8751c306..137a3219 100644 --- a/lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb +++ b/lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb @@ -12,7 +12,7 @@ def process_indexation(logger, options) status = LinkedData::Models::SubmissionStatus.find('INDEXED').first begin - index(logger, options[:index_commit], false) + index(logger, options[:commit], false) @submission.add_submission_status(status) rescue Exception => e logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") diff --git a/lib/ontologies_linked_data/services/submission_process/operations/submission_properties_indexer.rb b/lib/ontologies_linked_data/services/submission_process/operations/submission_properties_indexer.rb index 7c615d9a..beefd048 100644 --- a/lib/ontologies_linked_data/services/submission_process/operations/submission_properties_indexer.rb +++ b/lib/ontologies_linked_data/services/submission_process/operations/submission_properties_indexer.rb @@ -11,7 +11,7 @@ def process(logger, options = nil) def process_indexation(logger, options) status = LinkedData::Models::SubmissionStatus.find('INDEXED_PROPERTIES').first begin - index_properties(logger, options[:index_commit], false) + index_properties(logger, commit: options[:commit], optimize: false) @submission.add_submission_status(status) rescue StandardError => e logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") @@ -22,8 +22,7 @@ def process_indexation(logger, options) end end - - def index_properties(logger, commit = true, optimize = true) + def index_properties(logger, commit: true, optimize: true) page = 1 size = 2500 count_props = 0 From b6df920d03af15ee35a3a7c31078c6ea263ba41d Mon Sep 17 00:00:00 2001 From: 
Alex Skrenchuk Date: Wed, 13 Mar 2024 16:04:58 -0700 Subject: [PATCH 14/14] reset branch specifier to master --- Gemfile | 4 ++-- Gemfile.lock | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Gemfile b/Gemfile index ba7af023..84eff580 100644 --- a/Gemfile +++ b/Gemfile @@ -33,5 +33,5 @@ group :development do end # NCBO gems (can be from a local dev path or from rubygems/git) -gem 'goo', github: 'ncbo/goo', branch: 'develop' -gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'develop' +gem 'goo', github: 'ncbo/goo', branch: 'master' +gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'master' diff --git a/Gemfile.lock b/Gemfile.lock index a0bfd5f4..31e562c0 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 4ea0e70a4361fc694700e11f1012129452278c7d - branch: develop + revision: 75436fe8e387febc53e34ee31ff0e6dd837a9d3f + branch: master specs: goo (0.0.2) addressable (~> 2.8) @@ -15,8 +15,8 @@ GIT GIT remote: https://github.com/ncbo/sparql-client.git - revision: 55e7dbf858eb571c767bc67868f9af61663859cb - branch: develop + revision: d418d56a6c9ff5692f925b45739a2a1c66bca851 + branch: master specs: sparql-client (1.0.1) json_pure (>= 1.4)
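
Note on the Gemfile change above: the comment kept in the diff ("NCBO gems (can be from a local dev path or from rubygems/git)") hints at an alternative setup for local development. As a sketch only (the relative checkout paths below are assumptions, not part of this patch), the same two entries could point at local clones while iterating on unreleased goo or sparql-client changes:

    # Local-development alternative to the git-pinned sources above.
    # '../goo' and '../sparql-client' are placeholder paths for wherever
    # the gems happen to be checked out locally.
    gem 'goo', path: '../goo'
    gem 'sparql-client', path: '../sparql-client'

When switching back to the git sources, as this final patch does for the master branch, Gemfile.lock would normally be refreshed so its recorded revisions match the new branch specifier (for example with a targeted "bundle update goo sparql-client"), which is what the lockfile hunk above reflects.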