bids-things/bids_things/__init__.py
Michael Hanke ee75ce9f7b
rf: adjust project naming and semantics to target things directly
The idea is that all information is generated using `things/v2+`
features only. This would make this tool compatible with any
`things`-derived schema, and not specific to something like
`research-information`.

The adaptation to a specific, derived schema could then be implemented
by a generic (not BIDS-specific) tool that looks for information in a
record that can be expressed (more natively) using predefined structural
slots of a concrete class in a derived schema.
2026-03-26 10:34:05 +01:00

362 lines
11 KiB
Python

import sys
import json
from collections.abc import Iterator
from pathlib import (
Path,
PurePosixPath,
)
import click
from datasalad.itertools import (
decode_bytes,
itemize,
)
from datalad_core.repo import (
Worktree,
)
from datalad_core.runners import (
call_git_oneline,
iter_git_subproc,
)
def iter_gitcmd_zlines(
    path: Path,
    cmd: str,
    *args: str,
) -> Iterator[str]:
    """Run ``git <cmd>`` at ``path`` and yield zero-byte-separated items.

    An unconditional ``-z`` flag is injected so git separates output
    items with NUL bytes; each item is decoded and yielded as ``str``.
    """
    # '-z' must come first; everything else is passed through verbatim
    git_args = [cmd, '-z', *args]
    with iter_git_subproc(git_args, cwd=path) as proc:
        decoded = decode_bytes(proc, backslash_replace=True)
        yield from itemize(decoded, sep='\0', keep_ends=False)
def git_ls_tree(path: Path, *args) -> Iterator[str]:
    """Convenience wrapper: ``git ls-tree`` with NUL-separated output items."""
    yield from iter_gitcmd_zlines(path, 'ls-tree', *args)
def yield_matching_paths(
    src_path: Path,
    suffixes: list[str],
) -> Iterator[PurePosixPath]:
    """Yield paths tracked in HEAD whose filename ends in one of ``suffixes``."""
    # TODO make accepted image suffixes a configuration parameter
    wanted = tuple(suffixes)
    for entry in git_ls_tree(src_path, '-r', '--name-only', 'HEAD'):
        candidate = PurePosixPath(entry)
        # str.endswith accepts a tuple of alternatives
        if candidate.name.endswith(wanted):
            yield candidate
# Root command group; '-h' is registered as an alias for '--help'.
@click.group(context_settings={'help_option_names': ['-h', '--help']})
@click.version_option(version='0.1.0')
def cli():
    """Main entry point for the CLI.

    Subcommands are registered below via ``@cli.command()``.
    """
def get_datalad_pid(
baseid,
namespace: str | None = None,
entity: str | None = None,
content_defined_id: bool = False,
):
prefix = 'dldi' if content_defined_id else 'datalad'
if namespace is None and entity is not None:
msg = 'entity must have a namespace'
raise ValueError(msg)
if entity is None:
return f'{prefix}:{baseid}'
else:
return f'{prefix}:{baseid}/{namespace}/{entity}'
def get_dataset_rec(dsid, name):
    """Return a minimal Dataset record for dataset ID ``dsid``.

    ``name``, when truthy, becomes the record's ``title``.
    """
    record = {'pid': get_datalad_pid(dsid, 'dl', 'dataset')}
    if name:
        record['title'] = name
    return record
def get_dsdistribution_rec(commit, dsid):
    """Return a Distribution record for one dataset version (a commit).

    The commit SHA is used as a content-defined identifier, linked to
    the dataset-level PID via ``distribution_of``.
    """
    return {
        'pid': get_datalad_pid(commit, 'dl', 'commit', content_defined_id=True),
        'distribution_of': [get_datalad_pid(dsid, 'dl', 'dataset')],
        # TODO: define https://concepts.datalad.org/vocab/ as dlvocab:
        'broad_mappings': ['dlvocab:datalad-dataset-version'],
    }
def get_img_rec(dsid, rpath):
    """Build a DataItem record for a single BIDS image file.

    Parameters
    ----------
    dsid:
        DataLad dataset ID the image belongs to.
    rpath:
        Repository-relative path (``PurePosixPath``) of the image file.

    Returns
    -------
    dict or None
        The record, or ``None`` when the filename carries no ``sub-*``
        entity (not BIDS-compliant); a note is printed to stderr then.
    """
    # strip all suffixes (e.g. '.nii.gz') to get the BIDS entity string;
    # NOTE(review): dots inside entity values would be misread as suffixes
    suffix_len = sum(len(s) for s in rpath.suffixes)
    # guard: with no suffix the original slice name[:0] yielded ''
    imgid = rpath.name[:-suffix_len] if suffix_len else rpath.name
    subid = None
    #sesid = None
    for entity in imgid.split('_'):
        if entity.startswith('sub-'):
            subid = entity
        #elif entity.startswith('ses-'):
        #    sesid = entity
    if subid is None:
        print('Ignoring image without subject identifier, '
              f'not BIDS-compliant: {rpath}',
              file=sys.stderr)
        return None
    rec = {
        'pid': get_datalad_pid(dsid, 'bids', imgid),
        'part_of': get_datalad_pid(dsid, 'dl', 'dataset'),
        'generated_by': {
            # TODO classifier PID should be parameter
            # this is using a generic activity, rather than a specifically
            # crafted instance
            'object': 'obo:MAXO_0000424',
            'used': {
                'object': get_datalad_pid(dsid, 'bids', subid),
                'roles': [
                    # specimen role
                    'obo:OBI_0000112',
                ],
                # 'at_time': ...
            },
        },
        # magnetic resonance image data set
        'broad_mappings': [
            'obo:OBI_0003328',
        ],
    }
    #if sesid:
    #    rec['generated_by'] = get_datalad_pid(dsid, 'ns/mriscans', f'{subid}_{sesid}')
    #else:
    #    # the whole dataset subject only had one session
    #    rec['generated_by'] = get_datalad_pid(dsid, 'ns/mriscans', f'{subid}')
    return rec
def write_records(dst, records: dict[str, list]):
    """Write record collections as JSON Lines files.

    For each record type a file ``<dst>-<type>.jsonl`` is written, one
    compact JSON object per line, sorted by ``pid``. Types with no
    records produce no file.

    Parameters
    ----------
    dst:
        Base path; the type name and ``.jsonl`` suffix are appended.
    records:
        Mapping of record-type name to list of record dicts; every
        record must have a ``pid`` key.
    """
    dpath = Path(dst)
    for typ_, recs in records.items():
        if not recs:
            # no empty files
            continue
        # explicit UTF-8: ensure_ascii=False below may emit non-ASCII
        # characters, which would break on non-UTF-8 default locales
        with open(f'{dpath}-{typ_}.jsonl', 'w', encoding='utf-8') as fobj:
            for rec in sorted(recs, key=lambda x: x['pid']):
                json.dump(
                    rec,
                    fobj,
                    sort_keys=True,
                    ensure_ascii=False,
                    separators=(',', ':'),
                    indent=None,
                )
                fobj.write('\n')
# TODO move into a factory
# Module-level accumulator shared by all subcommands: each parse_* command
# appends to these lists and finally hands the whole mapping to
# write_records(). NOTE(review): being mutable module state, records
# accumulate across multiple command invocations within one process.
records = {
    'DataItem': [],
    'Dataset': [],
    'Distribution': [],
    #'Genesis': [],
    'Subject': [],
}
# MRIQC image-quality metrics (IQMs) to extract; only the keys are used
# (iterated in parse_mriqc), the values are placeholders.
iqm_types = {
    'snr_total': '',
    'snr_wm': '',
    'snrd_csf': '',
    'snrd_gm': '',
    'snrd_total': '',
    'summary_bg_k': '',
    'summary_csf_p95': '',
    'summary_wm_n': '',
    'wm2max': '',
}
@cli.command()
@click.argument('src', type=click.Path(exists=True))
@click.argument('dst', type=click.Path())
@click.argument('common_ns_dsid')
@click.argument('activity_id')
def parse_mriqc(src, dst, common_ns_dsid, activity_id):
    """Read dataset SRC and write to file with base path DST.

    Extracts IQM records from an MRIQC derivative dataset:

    - determines the single input (source) dataset from ``.gitmodules``
    - emits Dataset and Distribution records for the derivative
    - for every MRIQC-produced ``.json`` file, emits one DataItem record
      per known IQM (see ``iqm_types``), each linked to the source image

    COMMON_NS_DSID is the dataset ID providing the shared namespace for
    IQM-type PIDs; ACTIVITY_ID names the generating activity within it.
    """
    # RF wrt to parse_raw()
    wt = Worktree(Path(src))
    prefix = 'inputs/'
    derivation_source_dsid = None
    # scan .gitmodules for the one submodule under 'inputs/' that carries
    # a datalad-id — that is the dataset this derivative was computed from
    for mod in iter_gitcmd_zlines(
        wt.path, 'config', '--list', '--file', '.gitmodules',
    ):
        # 'git config --list -z' separates key and value with a newline
        mvar, mval = mod.split('\n', maxsplit=1)
        # ignore any line that is unrelated to a dataset in the
        # "inputs"
        if not mvar.startswith(f'submodule.{prefix}'):
            continue
        if not mvar.endswith('.datalad-id'):
            continue
        if derivation_source_dsid is not None:
            msg = "Found more than one input dataset. Not supported"
            raise RuntimeError(msg)
        derivation_source_dsid = mval.strip()
    # a bit ugly, this pulls it from the worktree and
    # not from HEAD
    dsid = wt.config.get('datalad.dataset.id').value
    commit = call_git_oneline(
        ['rev-parse', 'HEAD'],
        cwd=wt.path,
    )
    with (wt.path / 'dataset_description.json').open() as fobj:
        descr = json.load(fobj)
    generating_activity = f'{get_datalad_pid(common_ns_dsid)}/{activity_id}'
    dsrec = get_dataset_rec(dsid, descr.get('Name'))
    dsrec['generated_by'] = {
        'object': generating_activity,
    }
    dsrec['derived_from'] = [{
        'object': get_datalad_pid(derivation_source_dsid, 'dl', 'dataset'),
        'generated_by': {
            'object':
            # quality assessment
            'obo:T4FS_000015',
        },
        'used': [
            # TODO ideally a versioned object
            {'object': 'rrid:SCR_022942'},
        ]
    }]
    records['Dataset'].append(dsrec)
    records['Distribution'].append(get_dsdistribution_rec(commit, dsid))
    for rpath in yield_matching_paths(wt.path, ('.json',)):
        # NOTE(review): file handle from .open() is not explicitly closed
        data = json.load((wt.path / rpath).open('r'))
        # only consider JSON files that MRIQC itself produced
        try:
            if not data['provenance']['software'] == 'mriqc':
                continue
        except KeyError:
            continue
        # bids entity is the filename with .json stripped
        bids_img_entity = rpath.name[:-5]
        srcimg_pid = get_datalad_pid(
            derivation_source_dsid,
            'bids',
            bids_img_entity,
        )
        # one DataItem record per IQM present in this file
        for iqm_type in iqm_types:
            value = data.get(iqm_type)
            if value is None:
                continue
            rec = {
                'pid': get_datalad_pid(dsid, 'mriqc', f'{bids_img_entity}/{iqm_type}'),
                # in the absence of a global definition, use one in the scope of the
                # common namespace dataset
                'broad_mappings': [
                    get_datalad_pid(common_ns_dsid, 'mriqc/iqms', iqm_type),
                ],
                'quantitative_value': value,
                # link to derivative dataset with all IQMs
                'part_of': dsrec['pid'],
                # source image
                'derived_from': {
                    'object': srcimg_pid,
                },
                'generated_by': {
                    'object': generating_activity,
                },
            }
            records['DataItem'].append(rec)
    write_records(dst, records)
@cli.command()
@click.argument('src', type=click.Path(exists=True))
@click.argument('dst', type=click.Path())
# TODO: Study PID as a parameter
def parse_raw(src, dst):
    """Read dataset SRC and write to file with base path DST.

    Extracts records from a raw BIDS dataset:

    - a Dataset record (declared to conform to BIDS via RRID)
    - a Distribution record for the HEAD commit
    - one DataItem record per NIfTI image, plus a Subject record for
      each distinct ``sub-*`` entity encountered
    """
    wt = Worktree(Path(src))
    # a bit ugly, this pulls it from the worktree and
    # not from HEAD
    dsid = wt.config.get('datalad.dataset.id').value
    commit = call_git_oneline(
        ['rev-parse', 'HEAD'],
        cwd=wt.path,
    )
    with (wt.path / 'dataset_description.json').open() as fobj:
        descr = json.load(fobj)
    dsrec = get_dataset_rec(dsid, descr.get('Name'))
    # TODO the parts
    # rrid:SCR_016124 identifies the BIDS standard
    dsrec['conforms_to'] = [
        'rrid:SCR_016124',
    ]
    records['Dataset'].append(dsrec)
    records['Distribution'].append(get_dsdistribution_rec(commit, dsid))
    for ipath in yield_matching_paths(wt.path, ('.nii.gz', '.nii')):
        irec = get_img_rec(dsid, ipath)
        if irec is None:
            # non-BIDS-compliant file name, already reported on stderr
            continue
        records['DataItem'].append(irec)
        # build subject record
        subpid = irec['generated_by']['used']['object']
        # only add each subject once
        if not any(sp['pid'] == subpid for sp in records['Subject']):
            # PID ends in '.../sub-<name>'; extract the bare name
            subname = subpid.split('/')[-1].split('-')[-1]
            records['Subject'].append({
                'pid': subpid,
                'name': subname,
                # TODO fill study when parameter is present
            })
        ## build genesis record
        #genpid = irec['generated_by']
        #if not any(gp['pid'] == genpid for gp in records['Genesis']):
        #    if '_' in genpid.split('/')[-1]:
        #        # we have a named session per subject
        #        sesname = genpid.split('/')[-1].split('-')[-1]
        #    else:
        #        # the whole subject is one session only
        #        sesname = None
        #    records['Genesis'].append({
        #        'pid': genpid,
        #        'name': f'MRI scan subject {subname!r}' if not sesname else
        #        f'MRI scan subject {subname!r}, session {sesname!r}',
        #        'broad_mappings': [
        #            'obo:MAXO_0000424',
        #        ],
        #        # TODO fill study when parameter is present
        #    })
    write_records(dst, records)
# Script entry point; 'flatbids' is the program name shown in --help output.
if __name__ == '__main__':
    cli(prog_name='flatbids')