This schema takes the previously UI-annotated schema for a scientific data dictionary and adds Neurobagel-specific annotations such that a Neurobagel data dictionary can be derived from the schema programmatically. At the same time, the schema should allow the terms/fields described in the data dictionary to be captured by a shacl-vue form generated from the same schema. The main change is that a 'sample' class is now also modeled. A sample is connected to a specific participant_id and optionally to a specific session_id, and has several variables (columns) attached to it, e.g. age, sex, tool_X_score, etc. Each variable is a slot, which has the exact annotations needed for a data dictionary. Note that LinkML doesn't currently support multi-level annotation objects, so the annotations are added as JSON strings. The script generates JSON-LD from the schema using LinkML, and then filters and manipulates the output to get the exact data dictionary expected by Neurobagel.
126 lines
3.7 KiB
Python
126 lines
3.7 KiB
Python
"""
|
|
docstring to be created...
|
|
"""
|
|
|
|
|
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
|
import json
|
|
from pathlib import Path
|
|
import subprocess
|
|
import sys
|
|
|
|
|
|
def get_jsonld(schema_path):
    """Convert a LinkML schema to JSON-LD via the ``gen-jsonld`` command.

    Args:
        schema_path: Location of the schema file; it is converted with
            ``str()`` before being placed on the command line.

    Returns:
        Whatever ``run_subprocess`` returns for the command, i.e. the text
        that ``gen-jsonld`` wrote to standard output.
    """
    command = ['gen-jsonld']
    command.append(str(schema_path))
    return run_subprocess(command)
|
|
|
|
|
|
def run_subprocess(args):
    """Run a command and return the text it wrote to standard output.

    Args:
        args: The command and its arguments as a list of strings. The list
            is passed to ``subprocess.run`` directly, so no shell is used.

    Returns:
        The captured standard output, decoded as text.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured stdout/stderr are attached to the exception.
    """
    completed = subprocess.run(args, capture_output=True, text=True)
    if completed.returncode != 0:
        # Show the tool's own diagnostics and fail here. Otherwise the caller
        # only receives empty or partial stdout and fails later with an
        # unrelated error (e.g. a JSON decode error).
        sys.stderr.write(completed.stderr)
        completed.check_returncode()
    return completed.stdout
|
|
|
|
|
|
# Annotation tags that mark a slot as a column type Neurobagel knows about.
KNOWN_COLUMN_TYPES = (
    "nb:IdentifierNeurobagel",
    "nb:ToolNeurobagel",
    "nb:ContinuousColumn",
    "nb:CategoricalColumn",
)


def _annotation_with_value(annotations, tag):
    """Return the first annotation tagged *tag* that has a truthy value, else None."""
    return next(
        (a for a in annotations if a.get("tag") == tag and a.get("value")),
        None,
    )


def build_data_dictionary(data):
    """Build a Neurobagel data dictionary from parsed JSON-LD schema output.

    A slot is considered only if it has annotations, has no ``is_usage_slot``
    key, and carries an ``nb:Annotations`` annotation with a non-empty value.
    Such a slot is skipped (with a message on stdout) when it lacks a known
    column-type annotation set to true or a non-empty ``nb:Description``.

    Args:
        data: The parsed JSON-LD document; slots are read from its ``"slots"``
            list, each slot being a dict with ``"name"`` and ``"annotations"``
            (a list of dicts with ``"tag"`` and ``"value"``).

    Returns:
        A dict mapping slot name to ``{"Description": ..., "Annotations": ...}``
        plus ``"Levels"`` for categorical columns that define ``nb:Levels``.
        ``Annotations`` and ``Levels`` are decoded from their JSON strings.

    Raises:
        json.JSONDecodeError: If an ``nb:Annotations`` or ``nb:Levels`` value
            is not valid JSON.
    """
    data_dict = {}
    for slot in data.get("slots", []):
        annotations = slot.get("annotations")
        if not annotations or "is_usage_slot" in slot:
            continue
        nb_annotations = _annotation_with_value(annotations, "nb:Annotations")
        if nb_annotations is None:
            continue

        name = slot["name"]

        # The column-type flag must be an actual true value, not merely a
        # truthy one, so a string such as "false" does not qualify.
        column = next(
            (a for a in annotations
             if a.get("tag") in KNOWN_COLUMN_TYPES
             and a.get("value") == True),  # noqa: E712
            None,
        )
        if column is None:
            print(f"Slot '{name}' has no annotations with "
                  "a known neurobagel column type; cannot use "
                  "this slot for data dictionary.")
            continue

        description = _annotation_with_value(annotations, "nb:Description")
        if description is None:
            print(f"Slot '{name}' has no annotations with "
                  "a neurobagel-required description; cannot "
                  "use this slot for data dictionary.")
            continue

        entry = {
            "Description": description["value"],
            "Annotations": json.loads(nb_annotations["value"]),
        }

        # Levels are only meaningful for categorical columns.
        if column["tag"] == "nb:CategoricalColumn":
            levels = _annotation_with_value(annotations, "nb:Levels")
            if levels is not None:
                entry["Levels"] = json.loads(levels["value"])

        data_dict[name] = entry
    return data_dict


def main(argv=None):
    """Parse the command line, convert the schema and write the data dictionary.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use ``sys.argv``.
    """
    parser = ArgumentParser(epilog=__doc__,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument(
        "schema",
        type=str,
        help="Path to the LinkML schema with Neurobagel annotations (YAML)",
    )
    parser.add_argument(
        "--output",
        type=str,
        help="""Path to the output file containing the Neurobagel data dictionary""",
    )
    args = parser.parse_args(argv)

    # Establish some paths
    schema_path = Path(args.schema).resolve()
    if args.output:
        output_path = Path(args.output).resolve()
    else:
        output_path = Path.cwd() / 'data_dictionary.json'

    # Generate json-ld and reduce it to the data dictionary
    data = json.loads(get_jsonld(schema_path))
    data_dict = build_data_dictionary(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data_dict, f)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
# current_path = Path(__file__).resolve().parent
|
|
# test_fp = current_path / 'sddnb.jsonld'
|
|
|
|
# with open(test_fp) as json_data:
|
|
# data = json.load(json_data)
|
|
|
|
# print(data)
|
|
|
|
# obj = json.loads(data["value"])
|
|
|
|
# print(obj)
|
|
|
|
|
|
|