tools/code/bids_to_dlco.py

377 lines
No EOL
13 KiB
Python

from argparse import ArgumentParser
import json
from pathlib import Path
import sys
from ruamel.yaml import YAML
# Absolute path of the directory two levels above this file.
package_dir = Path(__file__).parent.parent.resolve()

# Module-wide YAML emitter: block (non-flow) style, with sequences indented
# by 4 columns and the dash placed at an offset of 2.
yaml = YAML()
yaml.indent(sequence=4, offset=2)
yaml.default_flow_style = False
def read_json_file(file_path):
    """
    Load dictionary from json file

    Args:
        file_path: Location of the JSON file (string or path-like object).

    Returns:
        The decoded JSON document, or ``None`` when the file could not be
        opened or read; in that case the ``OSError`` is reported on stdout.

    Raises:
        Exception: Any failure other than ``OSError`` (for example malformed
            JSON) is reported on stdout and then re-raised unchanged.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(file_path, encoding="utf-8") as f:
            return json.load(f)
    except OSError as err:
        print("OS error: {0}".format(err))
        # Best-effort contract: callers receive None for an unreadable file.
        return None
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # pass through without being reported as an "unexpected error".
        print("Unexpected error:", sys.exc_info()[0])
        raise
def add_obj_to_list(obj, arr, keys_to_match):
    """Add ``obj`` to ``arr`` unless a matching entry is already present.

    The lookup is delegated to ``find_duplicate_object_in_list`` using
    ``keys_to_match``.

    Returns:
        The entry that was already in ``arr`` when the lookup yields a truthy
        match; otherwise ``obj`` itself, after it has been appended to
        ``arr`` (``arr`` is modified in place).
    """
    match = find_duplicate_object_in_list(
        list_to_search=arr,
        new_obj=obj,
        keys_to_match=keys_to_match,
    )
    if match:
        return match
    arr.append(obj)
    return obj
def find_duplicate_object_in_list(
    list_to_search: list, new_obj: object, keys_to_match
):
    """Return the first entry of ``list_to_search`` that duplicates ``new_obj``.

    An entry is a duplicate when, for every key in ``keys_to_match``, the key
    is present in both ``new_obj`` and the entry and the two values compare
    equal. The candidates are narrowed one key at a time, keeping their
    original order.

    Returns:
        The first surviving entry, or ``None`` when nothing survives. With no
        keys to match, the first entry of ``list_to_search`` (if any) is
        returned.
    """
    candidates = list_to_search
    for key in keys_to_match:
        narrowed = []
        for candidate in candidates:
            # A key missing on either side can never produce a match.
            if key not in new_obj or key not in candidate:
                continue
            if candidate[key] == new_obj[key]:
                narrowed.append(candidate)
        candidates = narrowed
    return candidates[0] if candidates else None
def get_parts(
    dv_files,
    schema_files,
    *,
    download_url_base="https://dataverse.nl/api/access/datafile/",
):
    """Build the ``has_part`` and ``qualified_part`` lists of a distribution.

    Every file entry becomes a file object in the flat ``has_part`` list.
    Every directory on a file's path becomes a container object (with its own
    ``qualified_part`` list) in that same flat list. Parent/child structure is
    expressed through ``{"entity", "name"}`` relations: top-level files and
    directories are referenced from the returned ``qualified_part`` list,
    everything deeper from the ``qualified_part`` list of its parent
    directory's container object. All insertions are de-duplicated through
    ``add_obj_to_list``.

    Args:
        dv_files: Iterable of file-entry dicts. Each entry must provide
            ``"label"`` and ``"dataFile"`` (with ``"filesize"``, ``"md5"``,
            ``"id"`` and ``"contentType"``) and may provide
            ``"directoryLabel"``, a ``/``-separated directory path.
        schema_files: Not used by this function; kept so existing callers
            keep working.
        download_url_base: Prefix to which the data file id is appended to
            form the ``download_url`` of each file object.

    Returns:
        A tuple ``(has_part, qualified_part)`` of two lists as described
        above.
    """
    id_prefix = "exthisdsver:./"
    relation_keys = ["entity", "name"]
    has_part = []
    qualified_part = []
    for f in dv_files:
        label = f["label"]
        # An absent *or empty* directoryLabel marks a root-level file. An
        # empty label would otherwise produce a ".//<label>" id and leave the
        # file unreferenced by any qualified_part list.
        dir_label = f.get("directoryLabel")
        fpath = f"{dir_label}/{label}" if dir_label else label
        fdata = f["dataFile"]
        # All file objects have to be added to the main has_part list.
        file_obj = add_obj_to_list(
            obj={
                "id": id_prefix + fpath,
                "byte_size": fdata["filesize"],
                "checksum": [{
                    "algorithm": "md5",
                    "digest": fdata["md5"],
                }],
                "download_url": [download_url_base + str(fdata["id"])],
                "media_type": fdata["contentType"],
            },
            arr=has_part,
            keys_to_match=["id"],
        )
        file_relation = {
            "entity": file_obj["id"],
            "name": label,
        }
        # Relations of top-level entries live in the distribution's own
        # qualified_part list; while descending the directory tree this is
        # re-pointed at the qualified_part list of the directory just visited.
        parent_relations = qualified_part
        dir_parts = Path(dir_label).parts if dir_label else ()
        for depth, dirname in enumerate(dir_parts, start=1):
            dir_id = id_prefix + "/".join(dir_parts[:depth]) + "/"
            # Every directory gets one container object in the flat has_part
            # list (created on first sight, reused afterwards).
            container_obj = add_obj_to_list(
                obj={
                    "id": dir_id,
                    "qualified_part": [],
                },
                arr=has_part,
                keys_to_match=["id"],
            )
            # Reference the directory from its parent (or from the
            # distribution itself for the first path component).
            add_obj_to_list(
                obj={
                    "entity": dir_id,
                    "name": dirname,
                },
                arr=parent_relations,
                keys_to_match=relation_keys,
            )
            parent_relations = container_obj["qualified_part"]
        # parent_relations now belongs to the file's parent directory, or is
        # still the root-level list when the file has no directory components.
        add_obj_to_list(
            obj=file_relation,
            arr=parent_relations,
            keys_to_match=relation_keys,
        )
    return has_part, qualified_part
if __name__ == "__main__":
    # Command-line entry point: read the two metadata exports, assemble the
    # distribution record and write it to a YAML file in the current
    # working directory.
    # load arguments
    parser = ArgumentParser()
    parser.add_argument(
        "jsonfile", type=str, help="Path to the dataverse-json export",
    )
    parser.add_argument(
        "schema_file", type=str, help="Path to the schemaorg-jsonld export",
    )
    args = parser.parse_args()
    # load files into dicts
    json_obj = read_json_file(Path(args.jsonfile))
    schema_obj = read_json_file(Path(args.schema_file))
    # read_json_file() reports an unreadable file on stdout and returns None;
    # stop here instead of failing later with an unrelated TypeError.
    if json_obj is None or schema_obj is None:
        sys.exit(1)
    # Frequently used sub-records of the two exports
    creator = schema_obj["creator"][0]
    affiliation_name = creator["affiliation"]["name"]
    affiliation_id = "exthisds:#" + affiliation_name.replace(" ", "")
    dataset_version_json = json_obj["datasetVersion"]
    dataset_doi = schema_obj["identifier"]  # doi of the dataset
    # ---------------------------
    # Relations to contextual entities
    # i.e. describe the dataset contextually
    # ---------------------------
    # First list agents:
    agents = [
        # Stephan Heunis
        dict(
            id="exthisds:#" + creator["givenName"] + creator["familyName"],
            meta_type="dldist:Person",
            name=creator["givenName"] + ' ' + creator["familyName"],
            identifier=[
                {
                    "schema_agency": "https://orcid.org",
                    "notation": creator["identifier"].replace("https://orcid.org/", ""),
                },
            ],
            affiliation=[affiliation_id],
            same_as=[
                creator["identifier"],
            ],
        ),
        # Eindhoven University
        dict(
            id=affiliation_id,
            meta_type="dldist:Organization",
            name=affiliation_name,
            identifier=[
                {
                    "schema_agency": "https://ror.org",
                    "notation": "02c2kyt77",
                },
                {
                    "schema_agency": "https://www.wikidata.org/wiki",
                    "notation": "Q280824",
                },
            ],
            # Two distinct IRIs: the separating comma matters, because
            # adjacent string literals are concatenated into a single string.
            same_as=[
                "https://ror.org/02c2kyt77",
                "https://www.wikidata.org/wiki/Q280824",
            ],
        ),
    ]
    person_id = agents[0]["id"]
    organization_id = agents[1]["id"]
    # Then publications
    # note: these publications aren't contained in the dataverse metadata
    publications = [
        dict(
            id="exthisds:#data_paper",
            meta_type="dlsdd:Publication",
            notation="Heunis S, Breeuwer M, Caballero-Gaudes C et al. rt-me-fMRI: a task and resting state dataset for real-time, multi-echo fMRI methods development and validation [version 1; peer review: 1 approved, 1 approved with reservations]. F1000Research 2021, 10:70 (https://doi.org/10.12688/f1000research.29988.1)",
            date_published="2021-02-04",
            identifier=[
                {
                    "schema_agency": "https://doi.org",
                    "notation": "10.12688/f1000research.29988.1",
                },
            ],
            qualified_attribution=[
                {
                    "agent": person_id,
                    "had_role": [
                        "marcrel:aut",
                        "marcrel:col",
                        "marcrel:cre",
                    ],
                },
            ],
        ),
        dict(
            id="exthisds:#methods_paper",
            meta_type="dlsdd:Publication",
            notation="S. Heunis, M. Breeuwer, C. Caballero-Gaudes, L. Hellrung, W. Huijbers, J.F. Jansen, R. Lamerichs, S. Zinger, A.P. Aldenkamp. The effects of multi-echo fMRI combination and rapid T*-mapping on offline and real-time BOLD sensitivity. NeuroImage, 238 (2021), Article 118244, 10.1016/j.neuroimage.2021.118244",
            date_published="2021-06-11",
            identifier=[
                {
                    "schema_agency": "https://doi.org",
                    "notation": "10.1016/j.neuroimage.2021.118244",
                },
            ],
            qualified_attribution=[
                {
                    "agent": person_id,
                    "had_role": [
                        "marcrel:aut",
                    ],
                },
            ],
        ),
    ]
    # Then dataset version relation
    dataset_version = dict(
        id="exthisdsver:#",
        meta_type="dldist:Resource",
        date_modified=schema_obj["dateModified"],
        description=schema_obj["description"],
        is_version_of="exthisds:#",
        keyword=schema_obj["keywords"],
        landing_page=dataset_doi,
        name="dataverse-rtmefmri",
        title=schema_obj["name"],
        # jsonld only gives major version, so take both parts from the
        # dataverse-json export
        version=f"{dataset_version_json['versionNumber']}.{dataset_version_json['versionMinorNumber']}",
        same_as=[
            dataset_doi,
        ],
        identifier=[
            {
                "schema_agency": "https://doi.org",
                "notation": dataset_doi.replace("https://doi.org/", ""),
            },
        ],
        qualified_attribution=[
            dict(
                agent=person_id,
                had_role=[
                    "marcrel:aut",
                    "marcrel:col",
                    "marcrel:cre",
                ],
            ),
            dict(
                agent=organization_id,
                had_role=[
                    "marcrel:sht",  # Supporting host
                ],
            ),
        ],
        qualified_relation=[
            # data paper
            dict(
                had_role=[
                    "CiTO:isDocumentedBy",
                    "CiTO:citesAsAuthority",
                ],
                entity=[publications[0]["id"]],
            ),
            # methods paper
            dict(
                had_role=[
                    "CiTO:isCitedAsDataSourceBy",
                ],
                entity=[publications[1]["id"]],
            ),
            # TODO: include grants with "had_role: schema:funding"
        ],
    )
    # Then license (named license_doc so the builtin `license` is not shadowed)
    license_doc = dict(
        id="exthisds:#humanhealthdatalicense",
        meta_type="dldist:LicenseDocument",
        same_as=[
            schema_obj["license"],
        ],
        license_text=dataset_version_json["termsOfUse"],
    )
    # Then put the relations together in a list
    relation = [dataset_version] + agents + publications + [license_doc]
    # TODO: include study activity and study participants
    # TODO: include access
    # ---------------------------
    # Then build the distribution
    # ---------------------------
    # Distribution and some base properties, relations, and (qualified) parts
    has_part, qualified_part = get_parts(
        dataset_version_json["files"],
        schema_obj["distribution"],
    )
    distribution = dict(
        id="exthisdsver:.",
        conforms_to=[
            "https://bids-specification.readthedocs.io/en/v1.6.0",
        ],
        license=license_doc["id"],
        is_distribution_of="exthisdsver:#",
        was_attributed_to=[
            person_id,
            organization_id,
        ],
        relation=relation,
        qualified_part=qualified_part,
        has_part=has_part,
    )
    # Finally, write to yaml file. The encoding is explicit so that non-ASCII
    # metadata can be written regardless of the locale's default encoding.
    outfile = 'Distribution-dataverse-rtmefmri.yaml'
    with open(outfile, 'w', encoding="utf-8") as outf:
        yaml.dump(distribution, outf)