The idea is that all information is generated using `things/v2+` features only. This makes the tool compatible with any `things`-derived schema, rather than tying it to a specific one such as `research-information`. Adaptation to a particular derived schema could then be implemented by a generic (not BIDS-specific) tool that looks for information in a record that can be expressed (more natively) using the predefined structural slots of a concrete class in that derived schema.
362 lines
11 KiB
Python
362 lines
11 KiB
Python
import json
import sys
from collections.abc import (
    Iterable,
    Iterator,
)
from pathlib import (
    Path,
    PurePosixPath,
)

import click
from datasalad.itertools import (
    decode_bytes,
    itemize,
)
from datalad_core.repo import (
    Worktree,
)
from datalad_core.runners import (
    call_git_oneline,
    iter_git_subproc,
)
|
|
|
|
|
|
def iter_gitcmd_zlines(
    path: Path,
    cmd: str,
    *args: str,
) -> Iterator[str]:
    """Run ``git <cmd>`` at a given ``path`` and with ``args``

    An unconditional ``-z`` argument is used to get zero-byte separation
    of output items, internally. A generator is returned that yields ``str``
    type values corresponding to these items.
    """
    # force zero-byte item separation; any caller-provided args come after
    git_args = [cmd, '-z', *args]
    with iter_git_subproc(git_args, cwd=path) as proc_out:
        # decode the raw byte stream (undecodable bytes are backslash-escaped)
        # and split it on NUL separators, dropping the separators themselves
        text_chunks = decode_bytes(proc_out, backslash_replace=True)
        yield from itemize(text_chunks, sep='\0', keep_ends=False)
|
|
|
|
|
|
def git_ls_tree(path: Path, *args) -> Iterator[str]:
    """Run ``git ls-tree`` at a given ``path`` and with ``args``

    Thin convenience wrapper around ``iter_gitcmd_zlines()``; yields one
    ``str`` per zero-byte-separated output item.
    """
    return iter_gitcmd_zlines(path, 'ls-tree', *args)
|
|
|
|
|
|
def yield_matching_paths(
    src_path: Path,
    suffixes: Iterable[str],
) -> Iterator[PurePosixPath]:
    """Yield committed paths under ``src_path`` whose names end in a suffix.

    Paths are taken from ``git ls-tree -r HEAD``, i.e. the committed tree,
    not the worktree. ``suffixes`` is any iterable of filename endings
    (e.g. ``('.nii.gz', '.nii')``); a path is yielded when its name ends
    with at least one of them. The annotation was widened from ``list[str]``
    because callers pass tuples.
    """
    # materialize once: str.endswith() accepts a tuple of candidates,
    # replacing a per-item any() generator loop
    suffix_tuple = tuple(suffixes)
    # TODO make accepted image suffixes a configuration parameter
    for titem in git_ls_tree(
        src_path, '-r', '--name-only', 'HEAD',
    ):
        ipath = PurePosixPath(titem)
        if ipath.name.endswith(suffix_tuple):
            yield ipath
|
|
|
|
|
|
# Root click command group; subcommands (parse_raw, parse_mriqc) attach
# themselves via @cli.command(). The docstring doubles as --help text.
@click.group(context_settings={'help_option_names': ['-h', '--help']})
@click.version_option(version='0.1.0')
def cli():
    """Main entry point for the CLI."""
|
|
|
|
|
|
def get_datalad_pid(
|
|
baseid,
|
|
namespace: str | None = None,
|
|
entity: str | None = None,
|
|
content_defined_id: bool = False,
|
|
):
|
|
prefix = 'dldi' if content_defined_id else 'datalad'
|
|
if namespace is None and entity is not None:
|
|
msg = 'entity must have a namespace'
|
|
raise ValueError(msg)
|
|
if entity is None:
|
|
return f'{prefix}:{baseid}'
|
|
else:
|
|
return f'{prefix}:{baseid}/{namespace}/{entity}'
|
|
|
|
|
|
def get_dataset_rec(dsid, name):
    """Return a minimal Dataset record for dataset ``dsid``.

    The record always carries the dataset PID; a ``title`` is included
    only when ``name`` is non-empty.
    """
    pid = get_datalad_pid(dsid, 'dl', 'dataset')
    # omit the title key entirely for an empty/missing name
    return {'pid': pid, 'title': name} if name else {'pid': pid}
|
|
|
|
|
|
def get_dsdistribution_rec(commit, dsid):
    """Return a Distribution record linking ``commit`` to dataset ``dsid``.

    The distribution PID is content-defined (``dldi:`` scheme, keyed on
    the commit), and the record is broadly mapped onto the DataLad
    dataset-version concept.
    """
    # TODO: define https://concepts.datalad.org/vocab/ as dlvocab:
    return {
        'pid': get_datalad_pid(commit, 'dl', 'commit', content_defined_id=True),
        'distribution_of': [get_datalad_pid(dsid, 'dl', 'dataset')],
        'broad_mappings': ['dlvocab:datalad-dataset-version'],
    }
|
|
|
|
|
|
def get_img_rec(dsid, rpath):
    """Return a DataItem record for the image at ``rpath``, or ``None``.

    ``rpath`` is a repository-relative path. The BIDS image identifier is
    its filename with all suffixes (e.g. ``.nii.gz``) stripped. Images
    whose name carries no ``sub-*`` entity are not BIDS-compliant; they
    are reported on stderr and skipped (``None`` is returned).
    """
    # strip all suffixes to obtain the entity-composed image identifier;
    # guard the no-suffix case, where a [:0] slice would yield ''
    suffix_len = sum(len(s) for s in rpath.suffixes)
    imgid = rpath.name[:-suffix_len] if suffix_len else rpath.name
    subid = None
    # TODO also extract a 'ses-' entity, once session handling is supported
    for entity in imgid.split('_'):
        if entity.startswith('sub-'):
            subid = entity
    if subid is None:
        print('Ignoring image without subject identifier, '
              f'not BIDS-compliant: {rpath}',
              file=sys.stderr)
        return None
    return {
        'pid': get_datalad_pid(dsid, 'bids', imgid),
        'part_of': get_datalad_pid(dsid, 'dl', 'dataset'),
        'generated_by': {
            # TODO classifier PID should be parameter
            # this is using a generic activity, rather than a specifically
            # crafted instance
            'object': 'obo:MAXO_0000424',
            'used': {
                'object': get_datalad_pid(dsid, 'bids', subid),
                'roles': [
                    # specimen role
                    'obo:OBI_0000112',
                ],
                # 'at_time': ...
            },
        },
        # magnetic resonance image data set
        'broad_mappings': [
            'obo:OBI_0003328',
        ],
    }
|
|
|
|
|
|
def write_records(dst, records: dict[str, list]):
    """Write ``records`` to JSON Lines files, one file per record type.

    For each non-empty record list, a ``<dst>-<type>.jsonl`` file is
    created (overwriting any existing file). Records are sorted by their
    ``pid`` and serialized compactly, one per line. No file is produced
    for an empty record list.
    """
    dpath = Path(dst)
    for typ_, recs in records.items():
        if not recs:
            # no empty files
            continue
        # explicit UTF-8: with ensure_ascii=False non-ASCII characters are
        # emitted verbatim, which would fail or corrupt under a legacy
        # platform-default encoding
        with open(f'{dpath}-{typ_}.jsonl', 'w', encoding='utf-8') as fobj:
            # sort by PID for deterministic, diff-friendly output
            for rec in sorted(recs, key=lambda x: x['pid']):
                json.dump(
                    rec,
                    fobj,
                    sort_keys=True,
                    ensure_ascii=False,
                    separators=(',', ':'),
                    indent=None,
                )
                fobj.write('\n')
|
|
|
|
|
|
# TODO move into a factory
# Module-level accumulator for all generated records, keyed by record type.
# The type name also becomes the output filename suffix in write_records().
# NOTE(review): this is shared mutable state used by both CLI commands;
# running more than one command in a single process would accumulate
# records across commands — confirm this is intended before reuse.
records = {
    'DataItem': [],
    'Dataset': [],
    'Distribution': [],
    #'Genesis': [],
    'Subject': [],
}
|
|
|
|
# MRIQC image quality metrics (IQMs) to extract from mriqc JSON output.
# Only the keys are used (iterated in parse_mriqc); the empty-string values
# are presumably placeholders for per-metric definitions — TODO confirm.
iqm_types = {
    'snr_total': '',
    'snr_wm': '',
    'snrd_csf': '',
    'snrd_gm': '',
    'snrd_total': '',
    'summary_bg_k': '',
    'summary_csf_p95': '',
    'summary_wm_n': '',
    'wm2max': '',
}
|
|
|
|
|
|
@cli.command()
@click.argument('src', type=click.Path(exists=True))
@click.argument('dst', type=click.Path())
@click.argument('common_ns_dsid')
@click.argument('activity_id')
def parse_mriqc(src, dst, common_ns_dsid, activity_id):
    """Read dataset SRC and write to file with base path DST."""
    # RF wrt to parse_raw()
    wt = Worktree(Path(src))

    # determine the single input (source) dataset from .gitmodules;
    # only submodules under 'inputs/' are considered
    prefix = 'inputs/'
    derivation_source_dsid = None
    for mod in iter_gitcmd_zlines(
        wt.path, 'config', '--list', '--file', '.gitmodules',
    ):
        # with -z, git config separates variable name and value by a newline
        mvar, mval = mod.split('\n', maxsplit=1)
        # ignore any line that is unrelated to a dataset in the
        # "inputs"
        if not mvar.startswith(f'submodule.{prefix}'):
            continue
        if not mvar.endswith('.datalad-id'):
            continue
        if derivation_source_dsid is not None:
            msg = "Found more than one input dataset. Not supported"
            raise RuntimeError(msg)
        derivation_source_dsid = mval.strip()
    if derivation_source_dsid is None:
        # fail loudly: without a source dataset ID all derivation links
        # would point at a bogus 'datalad:None/...' PID
        msg = "Found no input dataset in .gitmodules. Not supported"
        raise RuntimeError(msg)

    # a bit ugly, this pulls it from the worktree and
    # not from HEAD
    dsid = wt.config.get('datalad.dataset.id').value
    commit = call_git_oneline(
        ['rev-parse', 'HEAD'],
        cwd=wt.path,
    )
    with (wt.path / 'dataset_description.json').open() as fobj:
        descr = json.load(fobj)

    # activity PID in the scope of the common namespace dataset
    generating_activity = f'{get_datalad_pid(common_ns_dsid)}/{activity_id}'
    dsrec = get_dataset_rec(dsid, descr.get('Name'))
    dsrec['generated_by'] = {
        'object': generating_activity,
    }
    dsrec['derived_from'] = [{
        'object': get_datalad_pid(derivation_source_dsid, 'dl', 'dataset'),
        'generated_by': {
            'object':
                # quality assessment
                'obo:T4FS_000015',
        },
        'used': [
            # TODO ideally a versioned object
            {'object': 'rrid:SCR_022942'},
        ]
    }]
    records['Dataset'].append(dsrec)
    records['Distribution'].append(get_dsdistribution_rec(commit, dsid))

    for rpath in yield_matching_paths(wt.path, ('.json',)):
        # use a context manager so each JSON file is closed promptly,
        # rather than leaking the handle until garbage collection
        with (wt.path / rpath).open() as fobj:
            data = json.load(fobj)
        try:
            # only consider mriqc-produced reports
            if data['provenance']['software'] != 'mriqc':
                continue
        except KeyError:
            continue

        # bids entity is the filename with .json stripped
        bids_img_entity = rpath.name[:-5]
        srcimg_pid = get_datalad_pid(
            derivation_source_dsid,
            'bids',
            bids_img_entity,
        )
        for iqm_type in iqm_types:
            value = data.get(iqm_type)
            if value is None:
                continue
            rec = {
                'pid': get_datalad_pid(dsid, 'mriqc', f'{bids_img_entity}/{iqm_type}'),
                # in the absence of a global definition, use one in the scope of the
                # common namespace dataset
                'broad_mappings': [
                    get_datalad_pid(common_ns_dsid, 'mriqc/iqms', iqm_type),
                ],
                'quantitative_value': value,
                # link to derivative dataset with all IQMs
                'part_of': dsrec['pid'],
                # source image
                'derived_from': {
                    'object': srcimg_pid,
                },
                'generated_by': {
                    'object': generating_activity,
                },
            }
            records['DataItem'].append(rec)

    write_records(dst, records)
|
|
|
|
|
|
@cli.command()
@click.argument('src', type=click.Path(exists=True))
@click.argument('dst', type=click.Path())
# TODO: Study PID as a parameter
def parse_raw(src, dst):
    """Read dataset SRC and write to file with base path DST."""
    wt = Worktree(Path(src))
    # a bit ugly, this pulls it from the worktree and
    # not from HEAD
    dsid = wt.config.get('datalad.dataset.id').value
    commit = call_git_oneline(
        ['rev-parse', 'HEAD'],
        cwd=wt.path,
    )
    with (wt.path / 'dataset_description.json').open() as fobj:
        descr = json.load(fobj)

    dsrec = get_dataset_rec(dsid, descr.get('Name'))
    # TODO the parts
    dsrec['conforms_to'] = [
        'rrid:SCR_016124',
    ]
    records['Dataset'].append(dsrec)
    records['Distribution'].append(get_dsdistribution_rec(commit, dsid))

    # track already-recorded subject PIDs in a set, replacing a linear
    # scan of records['Subject'] per image
    known_subjects = {sp['pid'] for sp in records['Subject']}
    for ipath in yield_matching_paths(wt.path, ('.nii.gz', '.nii')):
        irec = get_img_rec(dsid, ipath)
        if irec is None:
            continue
        records['DataItem'].append(irec)

        # build subject record
        subpid = irec['generated_by']['used']['object']
        if subpid not in known_subjects:
            known_subjects.add(subpid)
            # subject name is the value of the 'sub-' entity in the PID tail
            subname = subpid.split('/')[-1].split('-')[-1]
            records['Subject'].append({
                'pid': subpid,
                'name': subname,
                # TODO fill study when parameter is present
            })

        # TODO build a Genesis record per scanning session (one per
        # subject, or per sub/ses entity pair when sessions are named),
        # broadly mapped onto obo:MAXO_0000424, once session extraction
        # is supported by get_img_rec()

    write_records(dst, records)
|
|
|
|
|
|
if __name__ == '__main__':
    # run the click group under the tool name 'flatbids'
    cli(prog_name='flatbids')
|