tools/code/get_nb_data_dict.py
Stephan Heunis 55370197c4 A neurobagel-and-UI annotated linkml schema
This schema takes the previously UI-annotated schema
for a scientific data dictionary and adds neurobagel-
specific annotations such that a neurobagel data dictionary
can be derived from the schema, programmatically. At the same
time, the schema should allow for the terms/fields described
in the data dictionary to be captured by a shacl-vue form
generated from the same schema.

The main change is that a 'sample' class is now also modeled.
A sample is connected to a specific participant_id and optionally
to a specific session_id, and has several variables (columns)
attached to it, including e.g. age, sex, tool_X_score, etc.
Each variable is a slot, which has the appropriate and exact
annotations needed for a data dictionary. Note that LinkML doesn't
currently support multi-level annotation objects, so the annotations
are added as JSON strings.

The script can generate jsonld from the schema, using linkml, and
will then filter and manipulate the output to get the exact data
dictionary expected by neurobagel.
2024-09-20 12:06:30 +02:00

126 lines
3.7 KiB
Python

"""
docstring to be created...
"""
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import json
from pathlib import Path
import subprocess
import sys
def get_jsonld(schema_path):
    """Generate JSON-LD for a LinkML schema via the ``gen-jsonld`` command.

    Args:
        schema_path: Location of the schema file; it is converted with
            ``str()`` before being placed on the command line.

    Returns:
        Whatever ``run_subprocess`` returns for the ``gen-jsonld`` invocation.
    """
    command = ['gen-jsonld']
    command.append(str(schema_path))
    return run_subprocess(command)
def run_subprocess(args):
    """Run a command and return the text it wrote to standard output.

    The command's stdout and stderr are both captured as text. A non-zero
    exit status does not raise; instead the status and the captured stderr
    are forwarded to this process's stderr so the failure is visible rather
    than silently discarded.

    Args:
        args: The command and its arguments, passed straight to
            ``subprocess.run``.

    Returns:
        The captured standard output of the command (possibly empty).
    """
    completed = subprocess.run(args, capture_output=True, text=True)
    if completed.returncode != 0:
        # Surface the failure: otherwise the caller only sees empty/partial
        # output and fails later with an unrelated-looking error.
        print(
            f"Command {args!r} exited with status {completed.returncode}",
            file=sys.stderr,
        )
        if completed.stderr:
            print(completed.stderr, end="", file=sys.stderr)
    return completed.stdout
# Annotation tags that mark a slot as one of the Neurobagel column types.
KNOWN_COLUMN_TYPES = (
    "nb:IdentifierNeurobagel",
    "nb:ToolNeurobagel",
    "nb:ContinuousColumn",
    "nb:CategoricalColumn",
)


def _first_annotation(annotations, tags, flag_only=False):
    """Return the first annotation whose tag is in *tags*, or ``None``.

    With ``flag_only`` the annotation's value must equal ``True``; otherwise
    any truthy value qualifies. Annotations lacking a "tag" or "value" key
    are skipped instead of raising ``KeyError``.
    """
    for annotation in annotations:
        if annotation.get("tag") not in tags:
            continue
        value = annotation.get("value")
        # Equality (not identity) keeps the acceptance rule this script has
        # always applied to column-type flags.
        matched = value == True if flag_only else bool(value)  # noqa: E712
        if matched:
            return annotation
    return None


def build_data_dict(data):
    """Build a Neurobagel data dictionary from a decoded JSON-LD schema.

    Only slots that carry a non-empty ``nb:Annotations`` annotation and are
    not marked with ``is_usage_slot`` are considered. A considered slot is
    skipped, with a message printed to stdout, when it has no known column
    type annotation set to true or no non-empty ``nb:Description``.

    Args:
        data: Mapping decoded from the ``gen-jsonld`` output; its "slots"
            entry is expected to be a list of slot mappings, each with a
            "name" and a list of annotation mappings under "annotations".

    Returns:
        Dict keyed by slot name. Each entry holds "Description" and
        "Annotations" (the decoded ``nb:Annotations`` JSON string), plus
        "Levels" (decoded ``nb:Levels``) for categorical columns that
        provide it.

    Raises:
        json.JSONDecodeError: If an ``nb:Annotations`` or ``nb:Levels``
            value is not valid JSON.
    """
    data_dict = {}
    for slot in data.get("slots", []):
        annotations = slot.get("annotations")
        if not annotations or "is_usage_slot" in slot:
            continue
        # Use the same non-empty nb:Annotations entry for both selecting the
        # slot and decoding it, so an empty duplicate cannot be picked up.
        nb_annotations = _first_annotation(annotations, ("nb:Annotations",))
        if nb_annotations is None:
            continue

        s_name = slot["name"]
        column = _first_annotation(
            annotations, KNOWN_COLUMN_TYPES, flag_only=True
        )
        if column is None:
            print(f"Slot '{s_name}' has no annotations with "
                  "a known neurobagel column type; cannot use "
                  "this slot for data dictionary.")
            continue

        # Levels are only looked up for categorical columns.
        levels = None
        if column["tag"] == "nb:CategoricalColumn":
            levels = _first_annotation(annotations, ("nb:Levels",))

        description = _first_annotation(annotations, ("nb:Description",))
        if description is None:
            print(f"Slot '{s_name}' has no annotations with "
                  "a neurobagel-required description; cannot "
                  "use this slot for data dictionary.")
            continue

        entry = {
            "Description": description["value"],
            "Annotations": json.loads(nb_annotations["value"]),
        }
        if levels is not None:
            entry["Levels"] = json.loads(levels["value"])
        data_dict[s_name] = entry
    return data_dict


def main(argv=None):
    """Command-line entry point: write a data dictionary for a schema.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    parser = ArgumentParser(epilog=__doc__,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument(
        "schema",
        type=str,
        help="Path to the LinkML schema with Neurobagel annotations (YAML)",
    )
    parser.add_argument(
        "--output",
        type=str,
        help="""Path to the output file containing the Neurobagel data dictionary""",
    )
    args = parser.parse_args(argv)

    # Establish some paths
    schema_path = Path(args.schema).resolve()
    if args.output:
        output_path = Path(args.output).resolve()
    else:
        output_path = Path.cwd() / 'data_dictionary.json'

    # Generate json-ld
    json_ld = get_jsonld(schema_path)
    if not json_ld or not json_ld.strip():
        # Without this guard the failure shows up as a bare JSONDecodeError
        # that says nothing about the generator.
        sys.exit(f"gen-jsonld produced no output for schema '{schema_path}'")
    data = json.loads(json_ld)

    data_dict = build_data_dict(data)
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(data_dict, f)


if __name__ == "__main__":
    main()
# current_path = Path(__file__).resolve().parent
# test_fp = current_path / 'sddnb.jsonld'
# with open(test_fp) as json_data:
# data = json.load(json_data)
# print(data)
# obj = json.loads(data["value"])
# print(obj)