377 lines
No EOL
13 KiB
Python
377 lines
No EOL
13 KiB
Python
from argparse import ArgumentParser
|
|
import json
|
|
from pathlib import Path
|
|
import sys
|
|
from ruamel.yaml import YAML
|
|
|
|
# Module-wide ruamel.yaml emitter used to write the output document.
# Flow style is switched off and sequence indentation is set explicitly
# (sequence=4, offset=2) so nested lists are laid out consistently.
yaml=YAML()
yaml.default_flow_style = False
yaml.indent(sequence=4, offset=2)

# Resolved path of the directory two levels above this file.
# NOTE(review): not referenced anywhere else in the code shown here.
package_dir = Path(__file__).parent.parent.resolve()
|
|
|
|
|
|
def read_json_file(file_path):
    """
    Load dictionary from json file

    Parameters:
        file_path: location (str or path-like) of the JSON document.

    Returns:
        The decoded JSON content, or None when the file cannot be opened
        or read. In that case the OS error is printed, not raised.

    Raises:
        Any other failure (for example malformed JSON) is reported on
        stdout and then re-raised unchanged.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's locale default
        with open(file_path, encoding="utf-8") as f:
            return json.load(f)
    except OSError as err:
        print("OS error: {0}".format(err))
        # Explicit: callers must be prepared for a missing result
        return None
    except Exception as err:
        print("Unexpected error:", type(err))
        raise
|
|
|
|
|
|
def add_obj_to_list(obj, arr, keys_to_match):
    """
    Add ``obj`` to ``arr`` unless a matching entry is already stored.

    The lookup is delegated to ``find_duplicate_object_in_list`` with the
    given ``keys_to_match``. When that lookup yields a truthy entry, the
    entry is returned and ``arr`` is left untouched. Otherwise ``obj`` is
    appended to ``arr`` in place and returned.
    """
    match = find_duplicate_object_in_list(
        list_to_search=arr,
        new_obj=obj,
        keys_to_match=keys_to_match,
    )
    if match:
        return match
    arr.append(obj)
    return obj
|
|
|
|
|
|
def find_duplicate_object_in_list(
    list_to_search: list, new_obj: object, keys_to_match
):
    """
    Return the first entry of ``list_to_search`` that agrees with ``new_obj``.

    An entry agrees when, for every key in ``keys_to_match``, the key is
    present in both ``new_obj`` and the entry and the two values compare
    equal. Candidates are narrowed one key at a time, in the order the keys
    are given. Returns None when no candidate survives. With no keys to
    match, nothing is filtered and the first entry (if any) is returned.
    """
    candidates = list_to_search
    for key in keys_to_match:
        survivors = []
        for candidate in candidates:
            if key not in new_obj or key not in candidate:
                continue
            if candidate[key] == new_obj[key]:
                survivors.append(candidate)
        candidates = survivors

    return candidates[0] if candidates else None
|
|
|
|
|
|
def _add_qualified_part(arr, entity, name):
    """Record an entity/name relation in ``arr`` unless that pair is already there."""
    return add_obj_to_list(
        obj={"entity": entity, "name": name},
        arr=arr,
        keys_to_match=["entity", "name"],
    )


def get_parts(dv_files, schema_files):
    """
    Build the flat part list and the root-level relations for a file listing.

    Parameters:
        dv_files: iterable of file records. Each record is read for
            "label", "dataFile" (with "filesize", "md5", "id" and
            "contentType") and, optionally, "directoryLabel".
        schema_files: currently unused; kept so existing callers keep working.

    Returns:
        A ``(has_part, qualified_part)`` tuple of lists:
        - ``has_part`` holds one object per file and one per directory
          level. Directory objects carry their own "qualified_part" list
          naming their direct children.
        - ``qualified_part`` names the root-level files and the top-level
          directories.
        Entries are de-duplicated, so a directory shared by several files
        appears once.
    """
    has_part = []
    qualified_part = []
    for f in dv_files:
        label = f["label"]
        # A missing or empty "directoryLabel" marks a root-level file.
        # An empty string would otherwise produce a ".//name" id and a file
        # that is referenced from no qualified_part list at all.
        directory = f.get("directoryLabel")
        fpath = directory + "/" + label if directory else label
        fdata = f["dataFile"]

        # Every file object goes into the main has_part list
        file_obj = add_obj_to_list(
            obj={
                "id": "exthisdsver:./" + fpath,
                "byte_size": fdata["filesize"],
                "checksum": [{
                    "algorithm": "md5",
                    "digest": fdata["md5"],
                }],
                "download_url": [
                    "https://dataverse.nl/api/access/datafile/" + str(fdata["id"])
                ],
                "media_type": fdata["contentType"],
            },
            arr=has_part,
            keys_to_match=["id"],
        )

        if not directory:
            # Root-level file: relate it directly to the distribution
            _add_qualified_part(qualified_part, file_obj["id"], label)
            continue

        # File inside a directory tree: walk from the top-level directory
        # down to the file's parent, creating a container object per level
        # and linking each level into the one above it.
        dir_parts = list(Path(directory).parts)
        parent_container = None
        for idx, dirname in enumerate(dir_parts):
            current_path = f"{'/'.join(dir_parts[:idx+1])}/"
            container_obj = add_obj_to_list(
                obj={
                    "id": "exthisdsver:./" + current_path,
                    "qualified_part": [],
                },
                arr=has_part,
                keys_to_match=["id"],
            )
            # The top-level directory hangs off the distribution itself;
            # any deeper directory hangs off the container one level up.
            if idx == 0:
                siblings = qualified_part
            else:
                siblings = parent_container["qualified_part"]
            _add_qualified_part(siblings, "exthisdsver:./" + current_path, dirname)

            # The last level is the file's parent: attach the file there
            if idx == len(dir_parts) - 1:
                _add_qualified_part(
                    container_obj["qualified_part"],
                    "exthisdsver:./" + fpath,
                    label,
                )
            parent_container = container_obj

    return has_part, qualified_part
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the two Dataverse exports named on the
    # command line, assemble a distribution record from them plus some
    # hand-maintained metadata, and write the result as YAML into the
    # current working directory.

    # load arguments
    parser = ArgumentParser()
    parser.add_argument(
        "jsonfile", type=str, help="Path to the dataverse-json export",
    )
    parser.add_argument(
        "schema_file", type=str, help="Path to the schemaorg-jsonld export",
    )
    args = parser.parse_args()
    # load files into dicts
    json_obj = read_json_file(Path(args.jsonfile))
    schema_obj = read_json_file(Path(args.schema_file))
    if json_obj is None or schema_obj is None:
        # read_json_file has already printed the OS error; stop here rather
        # than failing later with an unrelated TypeError on None
        sys.exit(1)

    # ---------------------------
    # Relations to contextual entities
    # i.e. describe the dataset contextually
    # ---------------------------

    # Only the first creator listed in the schema.org export is described
    creator = schema_obj["creator"][0]
    affiliation_name = creator["affiliation"]["name"]
    organization_id = "exthisds:#" + affiliation_name.replace(" ", "")

    # First list agents:
    agents = [
        # Stephan Heunis
        dict(
            id="exthisds:#" + creator["givenName"] + creator["familyName"],
            meta_type="dldist:Person",
            name=creator["givenName"] + ' ' + creator["familyName"],
            identifier=[
                {
                    "schema_agency": "https://orcid.org",
                    "notation": creator["identifier"].replace("https://orcid.org/", ""),
                },
            ],
            affiliation=[organization_id],
            same_as=[
                creator["identifier"]
            ],
        ),
        # Eindhoven University
        # NOTE: the ROR and Wikidata identifiers below are hard-coded; they
        # are not taken from either export.
        dict(
            id=organization_id,
            meta_type="dldist:Organization",
            name=affiliation_name,
            identifier=[
                {
                    "schema_agency": "https://ror.org",
                    "notation": "02c2kyt77",
                },
                {
                    "schema_agency": "https://www.wikidata.org/wiki",
                    "notation": "Q280824",
                },
            ],
            # Two separate entries: without the comma the adjacent string
            # literals are concatenated into a single invalid URL
            same_as=[
                "https://ror.org/02c2kyt77",
                "https://www.wikidata.org/wiki/Q280824",
            ],
        ),
    ]
    # Then publications
    # note: these publications aren't contained in the dataverse metadata
    publications = [
        dict(
            id="exthisds:#data_paper",
            meta_type="dlsdd:Publication",
            notation="Heunis S, Breeuwer M, Caballero-Gaudes C et al. rt-me-fMRI: a task and resting state dataset for real-time, multi-echo fMRI methods development and validation [version 1; peer review: 1 approved, 1 approved with reservations]. F1000Research 2021, 10:70 (https://doi.org/10.12688/f1000research.29988.1)",
            date_published="2021-02-04",
            identifier=[
                {
                    "schema_agency": "https://doi.org",
                    "notation": "10.12688/f1000research.29988.1",
                },
            ],
            qualified_attribution=[
                {
                    "agent": agents[0]["id"],
                    "had_role": [
                        "marcrel:aut",
                        "marcrel:col",
                        "marcrel:cre"
                    ],
                }
            ]
        ),
        dict(
            id="exthisds:#methods_paper",
            meta_type="dlsdd:Publication",
            notation="S. Heunis, M. Breeuwer, C. Caballero-Gaudes, L. Hellrung, W. Huijbers, J.F. Jansen, R. Lamerichs, S. Zinger, A.P. Aldenkamp. The effects of multi-echo fMRI combination and rapid T*-mapping on offline and real-time BOLD sensitivity. NeuroImage, 238 (2021), Article 118244, 10.1016/j.neuroimage.2021.118244",
            date_published="2021-06-11",
            identifier=[
                {
                    "schema_agency": "https://doi.org",
                    "notation": "10.1016/j.neuroimage.2021.118244",
                },
            ],
            qualified_attribution=[
                {
                    "agent": agents[0]["id"],
                    "had_role": [
                        "marcrel:aut",
                    ],
                }
            ],
        )
    ]
    # Then dataset version relation
    dataset_version = dict(
        id="exthisdsver:#",
        meta_type="dldist:Resource",
        date_modified=schema_obj["dateModified"],
        description=schema_obj["description"],
        is_version_of="exthisds:#",
        keyword=schema_obj["keywords"],
        landing_page=schema_obj["identifier"],  # doi of the dataset
        name="dataverse-rtmefmri",
        title=schema_obj["name"],
        # jsonld only gives major version, so both parts come from the
        # dataverse-json export
        version=str(json_obj["datasetVersion"]["versionNumber"]) + "." + str(json_obj["datasetVersion"]["versionMinorNumber"]),
        same_as=[
            schema_obj["identifier"],
        ],
        identifier=[
            {
                "schema_agency": "https://doi.org",
                "notation": schema_obj["identifier"].replace("https://doi.org/", ""),
            },
        ],
        qualified_attribution=[
            dict(
                agent=agents[0]["id"],
                had_role=[
                    "marcrel:aut",
                    "marcrel:col",
                    "marcrel:cre"
                ],
            ),
            dict(
                agent=agents[1]["id"],
                had_role=[
                    "marcrel:sht",  # Supporting host
                ],
            )
        ],
        qualified_relation=[
            # data paper
            dict(
                had_role=[
                    "CiTO:isDocumentedBy",
                    "CiTO:citesAsAuthority",
                ],
                entity=[publications[0]["id"]]
            ),
            # methods paper
            dict(
                had_role=[
                    "CiTO:isCitedAsDataSourceBy",
                ],
                entity=[publications[1]["id"]]
            ),
            # TODO: include grants with "had_role: schema:funding"
        ],
    )
    # Then license
    # (named license_doc so the interactive `license` builtin is not shadowed)
    license_doc = dict(
        id="exthisds:#humanhealthdatalicense",
        meta_type="dldist:LicenseDocument",
        same_as=[
            schema_obj["license"],
        ],
        license_text=json_obj["datasetVersion"]["termsOfUse"]
    )
    # Then put the relations together in a list
    relation = [dataset_version] + agents + publications + [license_doc]
    # TODO: include study activity and study participants
    # TODO: include access

    # ---------------------------
    # Then build the distribution
    # ---------------------------

    # Distribution and some base properties, relations, and (qualified) parts
    has_part, qualified_part = get_parts(json_obj["datasetVersion"]["files"], schema_obj["distribution"])
    distribution = dict(
        id="exthisdsver:.",
        conforms_to=[
            "https://bids-specification.readthedocs.io/en/v1.6.0"
        ],
        license=license_doc["id"],
        is_distribution_of="exthisdsver:#",
        was_attributed_to=[
            agents[0]["id"],
            agents[1]["id"],
        ],
        relation=relation,
        qualified_part=qualified_part,
        has_part=has_part,
    )
    # Finally, write to yaml file (relative to the current working directory)
    outfile = 'Distribution-dataverse-rtmefmri.yaml'
    with open(outfile, 'w') as outf:
        yaml.dump(distribution, outf)