# tools/code/get_nb_data_dict.py

"""
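Generate a Neurobagel data dictionary (JSON) from a LinkML schema.

The schema is first converted to JSON-LD with the ``gen-jsonld`` command.
Every slot that carries a non-empty ``nb:Annotations`` annotation, a known
Neurobagel column type and an ``nb:Description`` is then written to the
output file, keyed by slot name. Slots missing the column type or the
description are reported and skipped.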
"""


from argparse import ArgumentParser, RawDescriptionHelpFormatter
import json
from pathlib import Path
import subprocess
import sys


def get_jsonld(schema_path):
    """Convert the schema at ``schema_path`` with the ``gen-jsonld`` command.

    The path is stringified and handed, together with the command name, to
    ``run_subprocess``; whatever that helper returns is passed straight back.
    """
    return run_subprocess(['gen-jsonld', str(schema_path)])


def run_subprocess(args):
    """Run a command and return its captured standard output as text.

    Args:
        args: The command and its arguments as a list; it is passed to
            ``subprocess.run`` without a shell.

    Returns:
        The text the command wrote to stdout.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured stderr is echoed to this process's stderr
            first and stays available on the exception's ``stderr``
            attribute.
    """
    try:
        completed = subprocess.run(
            args, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError as err:
        # Without this the tool's own diagnostics would be lost, because
        # stderr is captured rather than inherited.
        if err.stderr:
            sys.stderr.write(err.stderr)
        raise
    return completed.stdout


# Annotation tags that mark a slot as a usable Neurobagel column.
KNOWN_COLUMN_TYPES = (
    "nb:IdentifierNeurobagel",
    "nb:ToolNeurobagel",
    "nb:ContinuousColumn",
    "nb:CategoricalColumn",
)


def _first_annotation(annotations, tags, flag_only=False):
    """Return the first annotation matching one of *tags*, or ``None``.

    Args:
        annotations: Iterable of annotation dicts, each expected to hold a
            ``"tag"`` and a ``"value"`` key; dicts missing either key are
            simply never matched.
        tags: Tuple of tag names to accept.
        flag_only: When true, the value must compare equal to ``True``;
            otherwise any truthy value is accepted.
    """
    for annotation in annotations:
        if annotation.get("tag") not in tags:
            continue
        value = annotation.get("value")
        # The explicit comparison is deliberate: a column-type flag has to be
        # a real ``True``, not merely a truthy value such as a string.
        matched = value == True if flag_only else bool(value)  # noqa: E712
        if matched:
            return annotation
    return None


def build_data_dict(slots):
    """Build a Neurobagel data dictionary from JSON-LD slot definitions.

    A slot is considered only if it has annotations, has no
    ``"is_usage_slot"`` key and carries an ``nb:Annotations`` annotation with
    a non-empty value. Such a slot is then skipped, with a message printed to
    stdout, when it has no known column type flagged ``True`` or no
    ``nb:Description``.

    Args:
        slots: Iterable of slot dicts as found under ``"slots"`` in the
            parsed JSON-LD document.

    Returns:
        A dict mapping slot name to ``{"Description": ..., "Annotations":
        ...}``, plus ``"Levels"`` for categorical columns that define
        ``nb:Levels``. The ``nb:Annotations`` and ``nb:Levels`` values are
        decoded from their JSON string form.

    Raises:
        json.JSONDecodeError: If an ``nb:Annotations`` or ``nb:Levels`` value
            is not valid JSON.
    """
    data_dict = {}
    for slot in slots:
        annotations = slot.get("annotations")
        if not annotations or "is_usage_slot" in slot:
            continue
        nb_annotations = _first_annotation(annotations, ("nb:Annotations",))
        if nb_annotations is None:
            continue

        name = slot["name"]
        column = _first_annotation(
            annotations, KNOWN_COLUMN_TYPES, flag_only=True
        )
        if column is None:
            print(f"Slot '{name}' has no annotations with "
                  "a known neurobagel column type; cannot use "
                  "this slot for data dictionary.")
            continue

        description = _first_annotation(annotations, ("nb:Description",))
        if description is None:
            print(f"Slot '{name}' has no annotations with "
                  "a neurobagel-required description; cannot "
                  "use this slot for data dictionary.")
            continue

        entry = {
            "Description": description["value"],
            "Annotations": json.loads(nb_annotations["value"]),
        }
        # Levels only make sense for categorical columns.
        if column["tag"] == "nb:CategoricalColumn":
            levels = _first_annotation(annotations, ("nb:Levels",))
            if levels is not None:
                entry["Levels"] = json.loads(levels["value"])
        data_dict[name] = entry
    return data_dict


def main(argv=None):
    """Command-line entry point: schema in, data dictionary JSON out.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.
    """
    parser = ArgumentParser(epilog=__doc__,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument(
        "schema",
        type=str,
        help="Path to the LinkML schema with Neurobagel annotations (YAML)",
    )
    parser.add_argument(
        "--output",
        type=str,
        help="Path to the output file containing the Neurobagel data "
             "dictionary (default: data_dictionary.json in the current "
             "directory)",
    )
    args = parser.parse_args(argv)

    # Establish some paths
    schema_path = Path(args.schema).resolve()
    if args.output:
        output_path = Path(args.output).resolve()
    else:
        output_path = Path.cwd() / 'data_dictionary.json'

    # Generate json-ld
    json_ld = get_jsonld(schema_path)
    if not json_ld or not json_ld.strip():
        # Fail with a readable message instead of a JSONDecodeError on "".
        sys.exit(f"gen-jsonld produced no output for '{schema_path}'; "
                 "cannot build the data dictionary.")
    data = json.loads(json_ld)

    data_dict = build_data_dict(data.get("slots", []))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data_dict, f)


if __name__ == "__main__":
    main()


# current_path = Path(__file__).resolve().parent
# test_fp = current_path / 'sddnb.jsonld'

# with open(test_fp) as json_data:
#     data = json.load(json_data)

# print(data)

# obj = json.loads(data["value"])

# print(obj)