bids-things/bids_things/__init__.py
Michael Hanke ee75ce9f7b
rf: adjust project naming and semantics to target things directly
The idea is that all information is generated using `things/v2+`
features only. This would make this tool compatible with any
`things`-derived schema, and not specific to something like
`research-information`.

The adaptation to a specific, derived schema could then be implemented
by a generic (not BIDS-specific) tool that looks for information in a
record that can be expressed (more natively) using predefined structural
slots of a concrete class in a derived schema.
2026-03-26 10:34:05 +01:00

362 lines
11 KiB
Python

import sys
import json
from collections.abc import Iterator
from pathlib import (
Path,
PurePosixPath,
)
import click
from datasalad.itertools import (
decode_bytes,
itemize,
)
from datalad_core.repo import (
Worktree,
)
from datalad_core.runners import (
call_git_oneline,
iter_git_subproc,
)
def iter_gitcmd_zlines(
    path: Path,
    cmd: str,
    *args: str,
) -> Iterator[str]:
    """Run ``git <cmd>`` at ``path`` and yield zero-byte-separated items.

    An unconditional ``-z`` flag is injected so git separates output
    items with NUL bytes; each item is decoded and yielded as ``str``.
    """
    # '-z' must come first; everything else is passed through verbatim
    git_args = [cmd, '-z', *args]
    with iter_git_subproc(git_args, cwd=path) as proc:
        decoded = decode_bytes(proc, backslash_replace=True)
        yield from itemize(decoded, sep='\0', keep_ends=False)
def git_ls_tree(path: Path, *args) -> Iterator[str]:
    """Convenience wrapper: ``git ls-tree`` with NUL-separated output items."""
    yield from iter_gitcmd_zlines(path, 'ls-tree', *args)
def yield_matching_paths(
    src_path: Path,
    suffixes: list[str],
) -> Iterator[PurePosixPath]:
    """Yield paths tracked in HEAD whose filename ends in one of ``suffixes``."""
    # TODO make accepted image suffixes a configuration parameter
    wanted = tuple(suffixes)
    for entry in git_ls_tree(src_path, '-r', '--name-only', 'HEAD'):
        candidate = PurePosixPath(entry)
        # str.endswith accepts a tuple of alternatives
        if candidate.name.endswith(wanted):
            yield candidate
# Root command group; '-h' is registered as an alias for '--help'.
@click.group(context_settings={'help_option_names': ['-h', '--help']})
@click.version_option(version='0.1.0')
def cli():
    """Main entry point for the CLI.

    Subcommands are registered below via ``@cli.command()``.
    """
def get_datalad_pid(
baseid,
namespace: str | None = None,
entity: str | None = None,
content_defined_id: bool = False,
):
prefix = 'dldi' if content_defined_id else 'datalad'
if namespace is None and entity is not None:
msg = 'entity must have a namespace'
raise ValueError(msg)
if entity is None:
return f'{prefix}:{baseid}'
else:
return f'{prefix}:{baseid}/{namespace}/{entity}'
def get_dataset_rec(dsid, name):
    """Return a minimal Dataset record for dataset ID ``dsid``.

    ``name``, when truthy, becomes the record's ``title``.
    """
    record = {'pid': get_datalad_pid(dsid, 'dl', 'dataset')}
    if name:
        record['title'] = name
    return record
def get_dsdistribution_rec(commit, dsid):
    """Return a Distribution record for one dataset version (a commit).

    The commit SHA is used as a content-defined identifier, linked to
    the dataset-level PID via ``distribution_of``.
    """
    return {
        'pid': get_datalad_pid(commit, 'dl', 'commit', content_defined_id=True),
        'distribution_of': [get_datalad_pid(dsid, 'dl', 'dataset')],
        # TODO: define https://concepts.datalad.org/vocab/ as dlvocab:
        'broad_mappings': ['dlvocab:datalad-dataset-version'],
    }
def get_img_rec(dsid, rpath):
    """Build a DataItem record for a single BIDS image file.

    Parameters
    ----------
    dsid:
        DataLad dataset ID the image belongs to.
    rpath:
        Repository-relative path (``PurePosixPath``) of the image file.

    Returns
    -------
    dict or None
        The record, or ``None`` when the filename carries no ``sub-*``
        entity (not BIDS-compliant); a note is printed to stderr then.
    """
    # strip all suffixes (e.g. '.nii.gz') to get the BIDS entity string;
    # NOTE(review): dots inside entity values would be misread as suffixes
    suffix_len = sum(len(s) for s in rpath.suffixes)
    # guard: with no suffix the original slice name[:0] yielded ''
    imgid = rpath.name[:-suffix_len] if suffix_len else rpath.name
    subid = None
    #sesid = None
    for entity in imgid.split('_'):
        if entity.startswith('sub-'):
            subid = entity
        #elif entity.startswith('ses-'):
        #    sesid = entity
    if subid is None:
        print('Ignoring image without subject identifier, '
              f'not BIDS-compliant: {rpath}',
              file=sys.stderr)
        return None
    rec = {
        'pid': get_datalad_pid(dsid, 'bids', imgid),
        'part_of': get_datalad_pid(dsid, 'dl', 'dataset'),
        'generated_by': {
            # TODO classifier PID should be parameter
            # this is using a generic activity, rather than a specifically
            # crafted instance
            'object': 'obo:MAXO_0000424',
            'used': {
                'object': get_datalad_pid(dsid, 'bids', subid),
                'roles': [
                    # specimen role
                    'obo:OBI_0000112',
                ],
                # 'at_time': ...
            },
        },
        # magnetic resonance image data set
        'broad_mappings': [
            'obo:OBI_0003328',
        ],
    }
    #if sesid:
    #    rec['generated_by'] = get_datalad_pid(dsid, 'ns/mriscans', f'{subid}_{sesid}')
    #else:
    #    # the whole dataset subject only had one session
    #    rec['generated_by'] = get_datalad_pid(dsid, 'ns/mriscans', f'{subid}')
    return rec
def write_records(dst, records: dict[str, list]):
    """Write record collections as JSON Lines files.

    For each record type a file ``<dst>-<type>.jsonl`` is written, one
    compact JSON object per line, sorted by ``pid``. Types with no
    records produce no file.

    Parameters
    ----------
    dst:
        Base path; the type name and ``.jsonl`` suffix are appended.
    records:
        Mapping of record-type name to list of record dicts; every
        record must have a ``pid`` key.
    """
    dpath = Path(dst)
    for typ_, recs in records.items():
        if not recs:
            # no empty files
            continue
        # explicit UTF-8: ensure_ascii=False below may emit non-ASCII
        # characters, which would break on non-UTF-8 default locales
        with open(f'{dpath}-{typ_}.jsonl', 'w', encoding='utf-8') as fobj:
            for rec in sorted(recs, key=lambda x: x['pid']):
                json.dump(
                    rec,
                    fobj,
                    sort_keys=True,
                    ensure_ascii=False,
                    separators=(',', ':'),
                    indent=None,
                )
                fobj.write('\n')
# TODO move into a factory
# Module-level accumulator shared by all subcommands: each parse_* command
# appends to these lists and finally hands the whole mapping to
# write_records(). NOTE(review): being mutable module state, records
# accumulate across multiple command invocations within one process.
records = {
    'DataItem': [],
    'Dataset': [],
    'Distribution': [],
    #'Genesis': [],
    'Subject': [],
}
# MRIQC image-quality metrics (IQMs) to extract; only the keys are used
# (iterated in parse_mriqc), the values are placeholders.
iqm_types = {
    'snr_total': '',
    'snr_wm': '',
    'snrd_csf': '',
    'snrd_gm': '',
    'snrd_total': '',
    'summary_bg_k': '',
    'summary_csf_p95': '',
    'summary_wm_n': '',
    'wm2max': '',
}
@cli.command()
@click.argument('src', type=click.Path(exists=True))
@click.argument('dst', type=click.Path())
@click.argument('common_ns_dsid')
@click.argument('activity_id')
def parse_mriqc(src, dst, common_ns_dsid, activity_id):
    """Read dataset SRC and write to file with base path DST.

    Extracts IQM records from an MRIQC derivative dataset:

    - determines the single input (source) dataset from ``.gitmodules``
    - emits Dataset and Distribution records for the derivative
    - for every MRIQC-produced ``.json`` file, emits one DataItem record
      per known IQM (see ``iqm_types``), each linked to the source image

    COMMON_NS_DSID is the dataset ID providing the shared namespace for
    IQM-type PIDs; ACTIVITY_ID names the generating activity within it.
    """
    # RF wrt to parse_raw()
    wt = Worktree(Path(src))
    prefix = 'inputs/'
    derivation_source_dsid = None
    # scan .gitmodules for the one submodule under 'inputs/' that carries
    # a datalad-id — that is the dataset this derivative was computed from
    for mod in iter_gitcmd_zlines(
        wt.path, 'config', '--list', '--file', '.gitmodules',
    ):
        # 'git config --list -z' separates key and value with a newline
        mvar, mval = mod.split('\n', maxsplit=1)
        # ignore any line that is unrelated to a dataset in the
        # "inputs"
        if not mvar.startswith(f'submodule.{prefix}'):
            continue
        if not mvar.endswith('.datalad-id'):
            continue
        if derivation_source_dsid is not None:
            msg = "Found more than one input dataset. Not supported"
            raise RuntimeError(msg)
        derivation_source_dsid = mval.strip()
    # a bit ugly, this pulls it from the worktree and
    # not from HEAD
    dsid = wt.config.get('datalad.dataset.id').value
    commit = call_git_oneline(
        ['rev-parse', 'HEAD'],
        cwd=wt.path,
    )
    with (wt.path / 'dataset_description.json').open() as fobj:
        descr = json.load(fobj)
    generating_activity = f'{get_datalad_pid(common_ns_dsid)}/{activity_id}'
    dsrec = get_dataset_rec(dsid, descr.get('Name'))
    dsrec['generated_by'] = {
        'object': generating_activity,
    }
    dsrec['derived_from'] = [{
        'object': get_datalad_pid(derivation_source_dsid, 'dl', 'dataset'),
        'generated_by': {
            'object':
            # quality assessment
            'obo:T4FS_000015',
        },
        'used': [
            # TODO ideally a versioned object
            {'object': 'rrid:SCR_022942'},
        ]
    }]
    records['Dataset'].append(dsrec)
    records['Distribution'].append(get_dsdistribution_rec(commit, dsid))
    for rpath in yield_matching_paths(wt.path, ('.json',)):
        # NOTE(review): file handle from .open() is not explicitly closed
        data = json.load((wt.path / rpath).open('r'))
        # only consider JSON files that MRIQC itself produced
        try:
            if not data['provenance']['software'] == 'mriqc':
                continue
        except KeyError:
            continue
        # bids entity is the filename with .json stripped
        bids_img_entity = rpath.name[:-5]
        srcimg_pid = get_datalad_pid(
            derivation_source_dsid,
            'bids',
            bids_img_entity,
        )
        # one DataItem record per IQM present in this file
        for iqm_type in iqm_types:
            value = data.get(iqm_type)
            if value is None:
                continue
            rec = {
                'pid': get_datalad_pid(dsid, 'mriqc', f'{bids_img_entity}/{iqm_type}'),
                # in the absence of a global definition, use one in the scope of the
                # common namespace dataset
                'broad_mappings': [
                    get_datalad_pid(common_ns_dsid, 'mriqc/iqms', iqm_type),
                ],
                'quantitative_value': value,
                # link to derivative dataset with all IQMs
                'part_of': dsrec['pid'],
                # source image
                'derived_from': {
                    'object': srcimg_pid,
                },
                'generated_by': {
                    'object': generating_activity,
                },
            }
            records['DataItem'].append(rec)
    write_records(dst, records)
@cli.command()
@click.argument('src', type=click.Path(exists=True))
@click.argument('dst', type=click.Path())
# TODO: Study PID as a parameter
def parse_raw(src, dst):
    """Read dataset SRC and write to file with base path DST.

    Extracts records from a raw BIDS dataset:

    - a Dataset record (declared to conform to BIDS via RRID)
    - a Distribution record for the HEAD commit
    - one DataItem record per NIfTI image, plus a Subject record for
      each distinct ``sub-*`` entity encountered
    """
    wt = Worktree(Path(src))
    # a bit ugly, this pulls it from the worktree and
    # not from HEAD
    dsid = wt.config.get('datalad.dataset.id').value
    commit = call_git_oneline(
        ['rev-parse', 'HEAD'],
        cwd=wt.path,
    )
    with (wt.path / 'dataset_description.json').open() as fobj:
        descr = json.load(fobj)
    dsrec = get_dataset_rec(dsid, descr.get('Name'))
    # TODO the parts
    # rrid:SCR_016124 identifies the BIDS standard
    dsrec['conforms_to'] = [
        'rrid:SCR_016124',
    ]
    records['Dataset'].append(dsrec)
    records['Distribution'].append(get_dsdistribution_rec(commit, dsid))
    for ipath in yield_matching_paths(wt.path, ('.nii.gz', '.nii')):
        irec = get_img_rec(dsid, ipath)
        if irec is None:
            # non-BIDS-compliant file name, already reported on stderr
            continue
        records['DataItem'].append(irec)
        # build subject record
        subpid = irec['generated_by']['used']['object']
        # only add each subject once
        if not any(sp['pid'] == subpid for sp in records['Subject']):
            # PID ends in '.../sub-<name>'; extract the bare name
            subname = subpid.split('/')[-1].split('-')[-1]
            records['Subject'].append({
                'pid': subpid,
                'name': subname,
                # TODO fill study when parameter is present
            })
        ## build genesis record
        #genpid = irec['generated_by']
        #if not any(gp['pid'] == genpid for gp in records['Genesis']):
        #    if '_' in genpid.split('/')[-1]:
        #        # we have a named session per subject
        #        sesname = genpid.split('/')[-1].split('-')[-1]
        #    else:
        #        # the whole subject is one session only
        #        sesname = None
        #    records['Genesis'].append({
        #        'pid': genpid,
        #        'name': f'MRI scan subject {subname!r}' if not sesname else
        #        f'MRI scan subject {subname!r}, session {sesname!r}',
        #        'broad_mappings': [
        #            'obo:MAXO_0000424',
        #        ],
        #        # TODO fill study when parameter is present
        #    })
    write_records(dst, records)
# Script entry point; 'flatbids' is the program name shown in --help output.
if __name__ == '__main__':
    cli(prog_name='flatbids')