Merge branch 'master' into cthoyt-patch-1

mapping-commons · Jul 25, 2023 · 12222bf · 12222bf
2 parents 6fe59ee + a553df4
commit 12222bf
Show file tree

Hide file tree

Showing 19 changed files with 1,231 additions and 455 deletions.
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -36,15 +36,21 @@ nav:
     - How to contribute?: contributing.md
     - Code of Conduct: code_of_conduct.md
   - Resources for users:
-    - Use cases: usecases.md
+    - Use cases: 
+      - Overview: usecases.md
+      - How to gradually enrich OMOP mappings with SSSOM: tutorials/omop-mappings.md
     - Workshops: workshops.md
     - Presentations: presentations.md
     - Basic Tutorial: tutorial.md
+    - Mapping Justifications: mapping-justifications.md
     - How to use mapping predicates: mapping-predicates.md
+    - Set up a mapping registry/commons: mapping-commons.md
     - Overview of Chaining Rules: chaining_rules.md
     - A basic guide for the SSSOM toolkit: toolkit.md
     - 5-Star Entity Mappings - Cheatsheet: 5star-mappings.md
     - Update schema/context and release: update.md
+    - Matching tool implementation guide: matching-tool-implementation-guide.md
+    - Glossary: glossary.md
 
 site_url: https://mapping-commons.github.io/sssom/
 repo_url: https://github.com/mapping-commons/sssom/

diff --git a/project/excel/sssom_schema.xlsx b/project/excel/sssom_schema.xlsx
diff --git a/project/graphql/sssom_schema.graphql b/project/graphql/sssom_schema.graphql
@@ -74,6 +74,7 @@ type MappingSet
     objectSourceVersion: String
     mappingProvider: Uri
     mappingTool: String
+    mappingToolVersion: String
     mappingDate: Date
     subjectMatchField: [EntityReference]
     objectMatchField: [EntityReference]

diff --git a/project/jsonld/sssom_schema.context.jsonld b/project/jsonld/sssom_schema.context.jsonld
@@ -1,5 +1,9 @@
 {
-   "_comments": "Auto generated from sssom_schema.yaml by jsonldcontextgen.py version: 0.1.1\n    Generation date: 2023-06-19T15:06:30\n    Schema: sssom\n    metamodel version: 1.7.0\n    model version: None\n    \n    id: https://w3id.org/sssom/schema/\n    description: Datamodel for Simple Standard for Sharing Ontological Mappings (SSSOM)\n    license: https://creativecommons.org/publicdomain/zero/1.0/\n    ",
+   "comments": {
+      "description": "Auto generated by LinkML jsonld context generator",
+      "generation_date": "2023-07-25T09:55:30",
+      "source": "sssom_schema.yaml"
+   },
    "@context": {
       "dc": "http://purl.org/dc/terms/",
       "dcterms": "http://purl.org/dc/terms/",

diff --git a/project/jsonld/sssom_schema.jsonld b/project/jsonld/sssom_schema.jsonld
@@ -1399,6 +1399,7 @@
       "slot_uri": "https://w3id.org/sssom/mapping_tool_version",
       "owner": "Mapping",
       "domain_of": [
+        "MappingSet",
         "Mapping"
       ],
       "range": "string",
@@ -1707,6 +1708,7 @@
         "object_source_version",
         "mapping_provider",
         "mapping_tool",
+        "mapping_tool_version",
         "mapping_date",
         "subject_match_field",
         "object_match_field",
@@ -1814,9 +1816,9 @@
   ],
   "metamodel_version": "1.7.0",
   "source_file": "sssom_schema.yaml",
-  "source_file_date": "2023-06-19T14:51:37",
-  "source_file_size": 25627,
-  "generation_date": "2023-06-19T15:06:31",
+  "source_file_date": "2023-07-25T09:52:38",
+  "source_file_size": 25654,
+  "generation_date": "2023-07-25T09:55:31",
   "@type": "SchemaDefinition",
   "@context": [
     "project/jsonld/sssom_schema.context.jsonld",

diff --git a/project/jsonschema/sssom_schema.schema.json b/project/jsonschema/sssom_schema.schema.json
diff --git a/project/owl/sssom_schema.owl.ttl b/project/owl/sssom_schema.owl.ttl
diff --git a/project/protobuf/sssom_schema.proto b/project/protobuf/sssom_schema.proto
@@ -75,6 +75,7 @@ message MappingSet
   string objectSourceVersion = 0
   uri mappingProvider = 0
   string mappingTool = 0
+  string mappingToolVersion = 0
   date mappingDate = 0
  repeated  entityReference subjectMatchField = 0
  repeated  entityReference objectMatchField = 0

diff --git a/project/shacl/sssom_schema.shacl.ttl b/project/shacl/sssom_schema.shacl.ttl
diff --git a/project/shex/sssom_schema.shex b/project/shex/sssom_schema.shex
@@ -1,11 +1,11 @@
 BASE <https://w3id.org/sssom/>
+PREFIX prov: <http://www.w3.org/ns/prov#>
 PREFIX owl: <http://www.w3.org/2002/07/owl#>
 PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
 PREFIX linkml: <https://w3id.org/linkml/>
 PREFIX pav: <http://purl.org/pav/>
-PREFIX prov: <http://www.w3.org/ns/prov#>
 PREFIX dcterms: <http://purl.org/dc/terms/>
 
 
@@ -128,6 +128,7 @@ linkml:Sparqlpath xsd:string
           <object_source_version> @linkml:String ? ;
           <mapping_provider> @linkml:Uri ? ;
           <mapping_tool> @linkml:String ? ;
+          <mapping_tool_version> @linkml:String ? ;
           pav:authoredOn @linkml:Date ? ;
           <subject_match_field> @<EntityReference> * ;
           <object_match_field> @<EntityReference> * ;

diff --git a/project/sqlschema/sssom_schema.sql b/project/sqlschema/sssom_schema.sql
@@ -75,6 +75,7 @@ CREATE TABLE mapping_set (
 	object_source_version TEXT, 
 	mapping_provider TEXT, 
 	mapping_tool TEXT, 
+	mapping_tool_version TEXT, 
 	mapping_date DATE, 
 	subject_match_field TEXT, 
 	object_match_field TEXT, 
@@ -83,7 +84,7 @@ CREATE TABLE mapping_set (
 	see_also TEXT, 
 	other TEXT, 
 	comment TEXT, 
-	PRIMARY KEY (mappings, mapping_set_id, mapping_set_version, mapping_set_source, mapping_set_title, mapping_set_description, creator_id, creator_label, license, subject_type, subject_source, subject_source_version, object_type, object_source, object_source_version, mapping_provider, mapping_tool, mapping_date, subject_match_field, object_match_field, subject_preprocessing, object_preprocessing, see_also, other, comment)
+	PRIMARY KEY (mappings, mapping_set_id, mapping_set_version, mapping_set_source, mapping_set_title, mapping_set_description, creator_id, creator_label, license, subject_type, subject_source, subject_source_version, object_type, object_source, object_source_version, mapping_provider, mapping_tool, mapping_tool_version, mapping_date, subject_match_field, object_match_field, subject_preprocessing, object_preprocessing, see_also, other, comment)
 );
 
 CREATE TABLE mapping_set_reference (

diff --git a/src/docs/glossary.md b/src/docs/glossary.md
@@ -0,0 +1,3 @@
+# Glossary
+
+The glossary is currently being developed [here](https://docs.google.com/document/d/1QqR8j7szjaq6wzE9YLBnZ2kOD9eN14d3SYd312X8JjQ/edit?usp=sharing).
diff --git a/src/docs/mapping-commons.md b/src/docs/mapping-commons.md
@@ -0,0 +1,21 @@
+# How to set up a Mapping Commons
+
+A mapping commons is an open, collaborative space for managing and reconciling mappings. The goal is to collect mappings from a variety of sources into a _mapping set registry_, standardise them into a common representation, curate some basic metrics such as "confidence" (how much does the community managing the commons trust a specific mapping source?) and provenance (where exactly did this mapping come from before it was integrated?).
+
+There is no agreed-upon standard for mapping registries yet. SSSOM itself provides a [lightweight metadata model for mapping registries](https://mapping-commons.github.io/sssom/) which is, as of August 2023, under active development.
+
+## Typical setup of a mapping commons
+
+We recommend basing your mapping commons on a combination of GitHub (or GitLab) collaborative workflows (issues and discussions for the community, access management, etc.) and a git repository based on the [Mapping Commons Cookiecutter Template](https://github.com/mapping-commons/mapping-commons-cookiecutter) for version control of the mappings. 
+
+Using the template system above allows you to: 
+
+1. make use of basic CI and quality control for your mappings,
+2. document metadata about your mapping sets in a standard way,
+3. use a basic ETL system based on `gnu make` (which you don't have to use; it's just convenient),
+4. provide a standardised registry format that can be re-used/imported by others.
+
+Examples of Mapping Commons are:
+
+1. https://github.com/mapping-commons/mh_mapping_initiative
+1. https://gitlab.c-path.org/c-pathontology/mapping-commons
diff --git a/src/docs/mapping-justifications.md b/src/docs/mapping-justifications.md
@@ -0,0 +1,89 @@
+# Guide to using Mapping Justifications
+
+The goal of this document is to provide the user with a few pointers into the art of mapping justification construction. As of Summer 2023, the SSSOM justification system is still evolving, and will likely benefit from your input. Where informative metadata properties or values are missing from the [SSSOM datamodel](https://mapping-commons.github.io/sssom/) or [SEMAPV](https://mapping-commons.github.io/semantic-mapping-vocabulary/), request them on the [SSSOM](https://github.com/mapping-commons/sssom/issues) or [SEMAPV issue tracker](https://github.com/mapping-commons/semantic-mapping-vocabulary/issues) respectively.
+
+## Table of contents
+
+1. [lexical matching](#lexical-matching)
+1. [semantic similarity threshold-based matching](#semantic-matching)
+1. [mapping review](#mapping-review)
+1. Other justifications
+    1. background knowledge-based matching
+    1. composite matching
+    1. instance-based matching
+    1. lexical similarity threshold-based matching
+    1. logical reasoning
+    1. manual mapping curation
+    1. mapping chaining-based matching
+    1. mapping inversion-based matching
+    1. semantic similarity threshold-based matching
+    1. structural matching
+    1. unspecified matching
+
+
+<a id="lexical-matching"></a>
+
+## Lexical matching
+
+There are two kinds of lexical matching justifications we try to distinguish:
+
+- [semapv:LexicalMatching](https://w3id.org/semapv/vocab/LexicalMatching): The match is exact (potentially after pre-processing)
+- [semapv:LexicalSimilarityThresholdMatching](https://w3id.org/semapv/vocab/LexicalSimilarityThresholdMatching): The match is fuzzy (for example, Levenshtein distance). Note: embedding similarity, even if constructed purely of a word embedding, is considered a form of _semantic_ similarity.
+
+#### Level 1: Track the fact that the match was based on a lexical process
+
+Whenever a mapping was established by a lexical matching process, track at least that fact:
+
+- [mapping_justification](https://mapping-commons.github.io/sssom/mapping_justification/)`: `[semapv:LexicalMatching](https://w3id.org/semapv/vocab/LexicalMatching). This indicates that the mapping was determined through some form of exact lexical matching.
+
+#### Level 2: Track the specific datamodel fields involved in the matching process
+
+Regardless of which specific lexical matching justification you are working on, it is often useful to document the source field of the values used to acquire the match. For example:
+
+- [subject_match_field](https://mapping-commons.github.io/sssom/subject_match_field/)`: rdfs:label` indicates that the value of the `rdfs:label` property on the subject entity was used to establish the match.
+- [object_match_field](https://mapping-commons.github.io/sssom/object_match_field/)`: skos:prefLabel` indicates that the value of the `skos:prefLabel` property on the object entity was used to establish the match.
+- [match_string](https://mapping-commons.github.io/sssom/match_string/)`: somestring` the exact string that was used to establish the match. This is especially useful if preprocessing methods are applied, see below (Level 3).
+
+#### Level 3: Pre-processing
+
+There are many pre-processing techniques for text in the NLP literature, such as lower-casing or lemmatisation. To judge the fidelity of a match, it is often useful to document the exact techniques used.
+
+- [subject_preprocessing](https://mapping-commons.github.io/sssom/subject_preprocessing/)`: semapv:BlankNormalisation` indicates that before determining the match, blank characters (spaces etc.) were standardised in some way. There are plenty of preprocessing techniques already recorded in [SEMAPV](https://mapping-commons.github.io/semantic-mapping-vocabulary/), including semapv:BlankNormalisation, semapv:CaseNormalization, semapv:DiacriticsSuppression, semapv:DigitSuppression, semapv:Lemmatization, semapv:LinkStripping, semapv:PunctuationElemination, semapv:RegexRemoval, semapv:RegexReplacement, semapv:Stemming, semapv:StopWordRemoval, semapv:TermExtraction, semapv:Tokenization, but feel free to add more.
+
+However, there is one aspect that makes this process quite difficult to implement: Most matchers will blindly apply a set of normalisation techniques prior to processing, but not document which exact technique **had an effect**. It is obviously less useful to say: we applied all these 20 techniques, if only one of them was actually effectual (i.e. caused the string to change).
+
+If there is no (easy) way to keep track of which technique was effectual for any given match, we believe that it is still better to document all techniques, but doing so at the `mapping set` level rather than for each individual mapping (to keep the mapping sets smaller).
+
+<a id="semantic-matching"></a>
+
+## Semantic similarity threshold-based matching
+
+The basic idea behind "Semantic similarity threshold-based matching" is that a process that is "semantics aware" (in the loose sense, either by being cognisant about the graph structure, the logical structure, or a contextual textual knowledge such as an embedded Wikipedia article) enabled computing a score between the subject and object entity that to some degree reflects the "similarity" between the two entities. There are many examples of this:
+
+1. The (graph-)structure around the subject and object entities is projected into a common embedding space, and the similarity between the subject and object entities is expressed as the cosine similarity between the two embeddings.
+1. The Jaccard similarity between a set of properties of the subject and object entities is calculated.
+1. The Resnik score is calculated between the subject and object entities.
+
+**Important note on applicability of SSSOM for semantic similarity profiles**: SSSOM is not used for documenting semantic similarity profiles, i.e. cross-tables where some set of terms is compared with another set of terms and the semantic similarity is recorded as a score. SSSOM is used to document mappings, and SSSOM is applicable only if a mapping decision is influenced by a semantic similarity based approach, especially in conjunction with a specific threshold. For pure semantic similarity tables use [OAK Semantic Similarity](https://incatools.github.io/ontology-access-kit/datamodels/similarity/index.html).
+
+**Semantic vs lexical similarity?**: Semantic similarity is different from lexical similarity intuitively because the context (the graph structure, the background information) is taken into account and provides an (often crude) model of the actual entity, rather than of the word describing it. However, the distinctions can become a bit hazy. Imagine learning a graph embedding on a graph without edges, or a word embedding purely on a single label - there is definitely a grey zone where lexical similarity finishes and semantic similarity begins. In practice though, it should be mostly clear.
+
+#### Level 1: Documenting semantic similarity matches
+
+The suggested metadata for semantic similarity threshold based matching approach is:
+
+- [semantic_similarity_measure](https://mapping-commons.github.io/sssom/semantic_similarity_measure/)
+- [semantic_similarity_score](https://mapping-commons.github.io/sssom/semantic_similarity_score/)
+- ((author's note: Maybe we need a [value for similarity threshold](https://github.com/mapping-commons/sssom/issues/296)?))
+
+<a id="mapping-review"></a>
+
+## Mapping review
+
+[semapv:MappingReview](https://w3id.org/semapv/vocab/MappingReview) is a process conducted by a (usually human) agent to determine the validity of a specific given mapping. It differs from [semapv:ManualMappingCuration](https://w3id.org/semapv/vocab/ManualMappingCuration) in that it does not involve looking for alternative mappings or indeed, necessarily determining if a mapping is the best possible mapping. It should be considered cheaper, less trustworthy evidence compared to [semapv:ManualMappingCuration](https://w3id.org/semapv/vocab/ManualMappingCuration).
+
+There are two kinds of mapping reviews in SSSOM:
+
+- Review as an independent justification: [semapv:MappingReview](https://w3id.org/semapv/vocab/MappingReview) is an independent process that determines the validity of a mapping.
+- Review _of_ an existing justification: Instead of evaluating an entire mapping, you can record the fact that someone has looked at a specific justification and deemed it acceptable. In this case, simply record the reviewer's identity using the [reviewer_id](https://mapping-commons.github.io/sssom/reviewer_id/) or [reviewer_label](https://mapping-commons.github.io/sssom/reviewer_label/) fields.
+