Skip to content

Commit

Permalink
Merge pull request #15 from simleo/more_metadata
Browse files Browse the repository at this point in the history
convert: add more metadata
  • Loading branch information
simleo authored Feb 21, 2023
2 parents 928bb9d + 0e0fe2c commit 4078746
Show file tree
Hide file tree
Showing 28 changed files with 2,398 additions and 2,194 deletions.
124 changes: 114 additions & 10 deletions src/runcrate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,21 @@
"ParameterConnection": "https://w3id.org/ro/terms/workflow-run#ParameterConnection",
"connection": "https://w3id.org/ro/terms/workflow-run#connection",
"sourceParameter": "https://w3id.org/ro/terms/workflow-run#sourceParameter",
"targetParameter": "https://w3id.org/ro/terms/workflow-run#targetParameter"
"targetParameter": "https://w3id.org/ro/terms/workflow-run#targetParameter",
"sha1": "https://w3id.org/ro/terms/workflow-run#sha1"
}


CWLPROV_NONE = "https://w3id.org/cwl/prov#None"

PROFILES_VERSION = "0.1"
WROC_PROFILE_VERSION = "1.0"


def as_list(value):
    """Return ``value`` itself if it is a list, else a new one-item list holding it."""
    return value if isinstance(value, list) else [value]


def convert_cwl_type(cwl_type):
if isinstance(cwl_type, list):
Expand Down Expand Up @@ -171,10 +180,11 @@ def get_workflow(wf_path):

class ProvCrateBuilder:

def __init__(self, root, workflow_name=None, license=None):
def __init__(self, root, workflow_name=None, license=None, readme=None):
self.root = Path(root)
self.workflow_name = workflow_name
self.license = license
self.readme = Path(readme) if readme else readme
self.wf_path = self.root / "workflow" / WORKFLOW_BASENAME
self.cwl_defs = get_workflow(self.wf_path)
self.step_maps = self._get_step_maps(self.cwl_defs)
Expand Down Expand Up @@ -252,11 +262,42 @@ def get_dict(self, entity):
def build(self):
    """\
    Assemble and return the crate.

    A new ``ROCrate`` is created with preview generation disabled and its
    metadata extra terms are extended with ``EXTRA_TERMS``. The population
    steps then run in a fixed order: root metadata, profiles, workflow,
    engine run, the action for ``self.workflow_run`` and, last, the
    workflow input collection patch.
    """
    crate = ROCrate(gen_preview=False)
    crate.metadata.extra_terms.update(EXTRA_TERMS)
    # These steps all share the ``step(crate)`` signature.
    for step in (
        self.add_root_metadata,
        self.add_profiles,
        self.add_workflow,
        self.add_engine_run,
    ):
        step(crate)
    self.add_action(crate, self.workflow_run)
    # Runs after the actions have been added to the crate.
    self.patch_workflow_input_collection(crate)
    return crate

def add_root_metadata(self, crate):
    """\
    Attach the optional license and README to the crate's root dataset.

    If ``self.license`` is set it is stored as the root dataset's
    ``license``. If ``self.readme`` is set the file is added to the crate
    with ``about`` pointing at the root dataset; a ``.md`` suffix (any
    letter case) additionally gets ``encodingFormat`` ``text/markdown``.
    """
    license_value = self.license
    if license_value:
        crate.root_dataset["license"] = license_value
    readme_path = self.readme
    if not readme_path:
        return
    readme_entity = crate.add_file(readme_path)
    readme_entity["about"] = crate.root_dataset
    is_markdown = readme_path.suffix.lower() == ".md"
    if is_markdown:
        readme_entity["encodingFormat"] = "text/markdown"

def add_profiles(self, crate):
    """\
    Add the profile entities and list them in the root ``conformsTo``.

    One ``CreativeWork`` is added for each of the process, workflow and
    provenance run profiles at ``PROFILES_VERSION``, followed by one for
    the Workflow RO-Crate profile at ``WROC_PROFILE_VERSION``. The root
    dataset's ``conformsTo`` is set to the added entities, in that order.
    """
    def register(profile_id, name, version):
        # Every profile is the same kind of contextual entity.
        return crate.add(ContextEntity(crate, profile_id, properties={
            "@type": "CreativeWork",
            "name": name,
            "version": version,
        }))

    conforms_to = [
        register(
            f"https://w3id.org/ro/wfrun/{kind}/{PROFILES_VERSION}",
            f"{kind.title()} Run Crate",
            PROFILES_VERSION,
        )
        for kind in ("process", "workflow", "provenance")
    ]
    # FIXME: in the future, this could go out of sync with the wroc
    # profile added by ro-crate-py to the metadata descriptor
    conforms_to.append(register(
        f"https://w3id.org/workflowhub/workflow-ro-crate/{WROC_PROFILE_VERSION}",
        "Workflow RO-Crate",
        WROC_PROFILE_VERSION,
    ))
    crate.root_dataset["conformsTo"] = conforms_to

def add_workflow(self, crate):
lang_version = self.cwl_defs[WORKFLOW_BASENAME].cwlVersion
properties = {
Expand All @@ -267,8 +308,6 @@ def add_workflow(self, crate):
self.wf_path, self.wf_path.name, main=True, lang="cwl",
lang_version=lang_version, gen_cwl=False, properties=properties
)
if self.license:
crate.root_dataset["license"] = self.license
cwl_workflow = self.cwl_defs[workflow.id]
workflow["input"] = self.add_params(crate, cwl_workflow.inputs)
workflow["output"] = self.add_params(crate, cwl_workflow.outputs)
Expand Down Expand Up @@ -459,7 +498,18 @@ def add_action_params(self, crate, activity, to_wf_p, ptype="usage"):
action_params.append(action_p)
return action_params

def convert_param(self, prov_param, crate, convert_secondary=True, parent=""):
@staticmethod
def _set_alternate_name(prov_param, action_p, parent=None):
basename = getattr(prov_param, "basename", None)
if not basename:
return
if not parent:
action_p["alternateName"] = basename
return
if "alternateName" in parent:
action_p["alternateName"] = (Path(parent["alternateName"]) / basename).as_posix()

def convert_param(self, prov_param, crate, convert_secondary=True, parent=None):
type_names = frozenset(str(_) for _ in prov_param.types())
secondary_files = [_.generated_entity() for _ in prov_param.derivations()
if str(_.type) == "cwlprov:SecondaryFile"]
Expand All @@ -479,20 +529,24 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=""):
return action_p
if "wf4ever:File" in type_names:
hash_ = self.hashes[prov_param.id.localpart]
dest = Path(parent) / hash_
dest = Path(parent.id if parent else "") / hash_
action_p = crate.dereference(dest.as_posix())
if not action_p:
source = self.root / Path("data") / hash_[:2] / hash_
action_p = crate.add_file(source, dest)
action_p = crate.add_file(source, dest, properties={
"sha1": hash_,
})
self._set_alternate_name(prov_param, action_p, parent=parent)
return action_p
if "ro:Folder" in type_names:
hash_ = self.hashes[prov_param.id.localpart]
dest = Path(parent) / hash_
dest = Path(parent.id if parent else "") / hash_
action_p = crate.dereference(dest.as_posix())
if not action_p:
action_p = crate.add_directory(dest_path=dest)
self._set_alternate_name(prov_param, action_p, parent=parent)
for child in self.get_dict(prov_param).values():
part = self.convert_param(child, crate, parent=dest)
part = self.convert_param(child, crate, parent=action_p)
action_p.append_to("hasPart", part)
return action_p
if prov_param.value is not None:
Expand Down Expand Up @@ -548,3 +602,53 @@ def connect(source, target, entity):
pass
to_param = get_fragment(out.id)
connect(from_param, to_param, workflow)

def patch_workflow_input_collection(self, crate, wf=None):
    """\
    CWLProv records secondary files only in step runs, not in the workflow
    run. Thus, when the conversion of parameter values is completed,
    workflow-level parameters with secondary files get mapped to the main
    entity of the collection alone (a File). This method fixes the mapping
    by retrieving the correct Collection entity from the relevant tool
    execution.

    Parts of the workflow that are themselves ``ComputationalWorkflow``
    entities are patched recursively.

    :param crate: the crate whose entities are inspected and updated
    :param wf: the workflow to patch; defaults to ``crate.mainEntity``
    :raises RuntimeError: if ``wf`` has no ``CreateAction`` using it as
      instrument, or if no object can be found for a Collection
      parameter or for the parameter it is connected to
    """
    if wf is None:
        wf = crate.mainEntity
    sel = [_ for _ in crate.contextual_entities
           if "CreateAction" in as_list(_.type) and _.get("instrument") is wf]
    if not sel:
        raise RuntimeError(f"{wf.id} has no corresponding action")
    wf_action = sel[0]
    connections = [_ for _ in crate.contextual_entities
                   if "ParameterConnection" in as_list(_.type)]
    # Property values are normalized with as_list before iterating, so
    # that a single (non-list) value is handled like a one-item list.
    for param in as_list(wf.get("input", [])):
        if param.get("additionalType") != "Collection":
            continue
        src_sel = [_ for _ in as_list(wf_action.get("object", []))
                   if param in as_list(_.get("exampleOfWork"))]
        if not src_sel:
            raise RuntimeError(f"object for param {param.id} not found")
        obj = src_sel[0]
        # The type may be a single string or a list of strings.
        if "Collection" in as_list(obj.type):
            continue
        param_connections = [_ for _ in connections if _["sourceParameter"] is param]
        if not param_connections:
            continue
        tgt_param = param_connections[0]["targetParameter"]
        tgt_sel = [_ for _ in crate.get_entities()
                   if tgt_param in as_list(_.get("exampleOfWork"))]
        if not tgt_sel:
            raise RuntimeError(f"object for param {tgt_param.id} not found")
        tgt_obj = tgt_sel[0]
        # Swap the lone File for the Collection in the action's objects.
        wf_action["object"] = [
            _ for _ in as_list(wf_action["object"]) if _ is not obj
        ] + [tgt_obj]
        tgt_obj.append_to("exampleOfWork", param)
        # Detach the parameter from the File. The branch is chosen on the
        # filtered list itself, never on a value already written back.
        remaining = [_ for _ in as_list(obj["exampleOfWork"])
                     if _ is not param]
        if not remaining:
            del obj["exampleOfWork"]
        elif len(remaining) == 1:
            obj["exampleOfWork"] = remaining[0]
        else:
            obj["exampleOfWork"] = remaining
    for tool in as_list(wf.get("hasPart", [])):
        if "ComputationalWorkflow" in as_list(tool.type):
            self.patch_workflow_input_collection(crate, wf=tool)
9 changes: 7 additions & 2 deletions src/runcrate/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,18 @@ def cli():
metavar="STRING",
help="original workflow name",
)
def convert(root, output, license, workflow_name):
@click.option(
"--readme",
type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path),
help="path to a README file (should be README.md in Markdown format)",
)
def convert(root, output, license, workflow_name, readme):
"""\
RO_DIR: top-level directory of the CWLProv RO
"""
if not output:
output = Path(f"{root.name}.crate.zip")
builder = ProvCrateBuilder(root, workflow_name, license)
builder = ProvCrateBuilder(root, workflow_name, license, readme)
crate = builder.build()
if output.suffix == ".zip":
crate.write_zip(output)
Expand Down
6 changes: 3 additions & 3 deletions tests/data/grepucase-run-1/bag-info.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Bag-Software-Agent: cwltool 3.1.20221201130942
Bag-Software-Agent: cwltool 3.1.20230213100550
BagIt-Profile-Identifier: https://w3id.org/ro/bagit/profile
Bagging-Date: 2022-12-16
Bagging-Date: 2023-02-17
External-Description: Research Object of CWL workflow run
External-Identifier: arcp://uuid,86e05e63-d4dd-4a06-8235-9d26df405724/
External-Identifier: arcp://uuid,422fecc5-1e57-45bc-a653-1513a7a6fe70/
Payload-Oxum: 151.7

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
@prefix id: <urn:uuid:> .
@prefix ore: <http://www.openarchives.org/ore/terms/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix ro: <http://purl.org/wf4ever/ro#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

id:3c9e35a6-42d6-4905-8a27-0e4159882180 a ro:FolderEntry,
ore:Proxy,
prov:Entity ;
ro:entryName "foo.out"^^xsd:string ;
ore:proxyFor id:bea1dc23-0537-4874-82ce-454f48c08d0a ;
ore:proxyIn id:5b25d2eb-2e89-4d62-8145-2ea3656ba173 .

id:8fc06ba2-c479-498a-b95e-61eb975eb53b a ro:FolderEntry,
ore:Proxy,
prov:Entity ;
ro:entryName "bar.out"^^xsd:string ;
ore:proxyFor id:8cb9abec-9ecf-4e54-86b5-7f08a073ce60 ;
ore:proxyIn id:5b25d2eb-2e89-4d62-8145-2ea3656ba173 .

id:5b25d2eb-2e89-4d62-8145-2ea3656ba173 a ro:Folder,
ore:Aggregation,
prov:Entity ;
ore:aggregates id:3c9e35a6-42d6-4905-8a27-0e4159882180,
id:8fc06ba2-c479-498a-b95e-61eb975eb53b .

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
@prefix ro: <http://purl.org/wf4ever/ro#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

id:3504fafc-d1b7-4165-a4d7-7a0c4bfbbe98 a ro:Folder,
ore:Aggregation,
prov:Entity ;
ore:aggregates id:83160fd7-e052-40ef-9f0c-8af9b40c952b .

id:83160fd7-e052-40ef-9f0c-8af9b40c952b a ro:FolderEntry,
id:899d951e-0ef0-42a7-a460-3cf4c3b3aff5 a ro:FolderEntry,
ore:Proxy,
prov:Entity ;
ro:entryName "bar.out.out"^^xsd:string ;
ore:proxyFor id:54ed715b-cb81-46d5-bbd0-41c8c4131a56 ;
ore:proxyIn id:3504fafc-d1b7-4165-a4d7-7a0c4bfbbe98 .
ore:proxyFor id:2c6ae84c-c98c-4173-aa24-44b9bb21d372 ;
ore:proxyIn id:8cb9abec-9ecf-4e54-86b5-7f08a073ce60 .

id:8cb9abec-9ecf-4e54-86b5-7f08a073ce60 a ro:Folder,
ore:Aggregation,
prov:Entity ;
ore:aggregates id:899d951e-0ef0-42a7-a460-3cf4c3b3aff5 .

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
@prefix id: <urn:uuid:> .
@prefix ore: <http://www.openarchives.org/ore/terms/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix ro: <http://purl.org/wf4ever/ro#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

id:93dc7383-4670-4192-9954-659613284713 a ro:FolderEntry,
ore:Proxy,
prov:Entity ;
ro:entryName "bar"^^xsd:string ;
ore:proxyFor id:3890a779-35f5-4753-8cce-9c4645221ea2 ;
ore:proxyIn id:a632ff01-ca44-453b-982f-96879f1b85ba .

id:dfa94007-0aba-4189-a4a2-f4be72d084ef a ro:FolderEntry,
ore:Proxy,
prov:Entity ;
ro:entryName "foo"^^xsd:string ;
ore:proxyFor id:c7636f08-692c-4e1e-927f-823e9e76aac7 ;
ore:proxyIn id:a632ff01-ca44-453b-982f-96879f1b85ba .

id:a632ff01-ca44-453b-982f-96879f1b85ba a ro:Folder,
ore:Aggregation,
prov:Entity ;
ore:aggregates id:93dc7383-4670-4192-9954-659613284713,
id:dfa94007-0aba-4189-a4a2-f4be72d084ef .

Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
@prefix ro: <http://purl.org/wf4ever/ro#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

id:378648cc-14bb-42ba-90e5-fbf9e4099a1c a ro:Folder,
ore:Aggregation,
prov:Entity ;
ore:aggregates id:eb32dcb0-f27a-42ec-a3ed-6314546844a0 .

id:eb32dcb0-f27a-42ec-a3ed-6314546844a0 a ro:FolderEntry,
id:7329bf41-05c0-4881-8bd3-1203a530f736 a ro:FolderEntry,
ore:Proxy,
prov:Entity ;
ro:entryName "foo.out.out"^^xsd:string ;
ore:proxyFor id:0867ef40-bf1d-4b05-87ab-03a90f476d5e ;
ore:proxyIn id:378648cc-14bb-42ba-90e5-fbf9e4099a1c .
ore:proxyFor id:cb12bf67-5541-4f8c-b897-47a87e11a242 ;
ore:proxyIn id:bea1dc23-0537-4874-82ce-454f48c08d0a .

id:bea1dc23-0537-4874-82ce-454f48c08d0a a ro:Folder,
ore:Aggregation,
prov:Entity ;
ore:aggregates id:7329bf41-05c0-4881-8bd3-1203a530f736 .

Loading

0 comments on commit 4078746

Please sign in to comment.