From 43745352cffffc9069d82b723c99e9eb4e582e19 Mon Sep 17 00:00:00 2001
From: Adina Wagner <adina.wagner@t-online.de>
Date: Thu, 28 May 2026 16:15:48 +0200
Subject: [PATCH 1/4] WIP: Start towards automatic file metadata ingestion from
 local datasets

---
 code/index-dataset-files.py | 234 ++++++++++++++++++++++++++++++++++++
 1 file changed, 234 insertions(+)
 create mode 100644 code/index-dataset-files.py

diff --git a/code/index-dataset-files.py b/code/index-dataset-files.py
new file mode 100644
index 0000000..b771180
--- /dev/null
+++ b/code/index-dataset-files.py
@@ -0,0 +1,234 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#   "datalad-core @ git+https://hub.datalad.org/datalad/datalad-core@minilad",
+#   "dump-things-pyclient @ https://hub.psychoinformatics.de/datalink/dump-things-pyclient.git",
+#   "git-annex",
+#   "rich",
+#   "rich-click",
+# ]
+# ///
+
+
+import uuid
+import rich_click as click
+from os import environ
+from pathlib import Path
+from pprint import pprint
+
+from datalad.api import Dataset
+from datalad_next.iter_collections import iter_annexworktree
+
+"""
+Brainstorming how this tool can work:
+
+* Use script on a local BIDS dataset
+* Specify a subject
+"""
+
+
+script_pid = '12345'
+
+bare_subject_rec = {
+    'display_label': None,
+    'kind': None,
+    'schema_type': "xyzri:XYZSubject",
+    'pid': None,
+    'study': None
+}
+
+
+bare_study_rec = {
+    "schema_type": "xyzri:XYZStudy",
+    "pid": None,
+    "title": None
+}
+
+bare_file_rec = {
+  "derived_from": [
+    {
+      "schema_type": "dlthings:Derivation",
+      "object": None
+    }
+  ],
+  "display_label": None,
+  "schema_type": "xyzri:XYZFile",
+  "pid": None,
+  "byte_size": None,
+}
+
+
+class FileRegistrator(object):
+    def __init__(
+            self,
+            repo: Dataset,
+            studypid: str,
+            subject: str,
+            subjectpid: str,
+            dtc_api_url: str,
+            dtc_collection: str,
+    ) -> None:
+        self.repo = repo
+        self.studypid = studypid
+        self.subject = subject
+        self.subjectpid = subjectpid
+        self.dtc_api_url = dtc_api_url
+        self.dtc_collection = dtc_collection
+
+    def create_records(
+            self,
+            file_rec,
+            ) -> list:
+        """Create one metadata record per file below the subject directory."""
+        # TODO: don't edit anything a human touched
+        recs = []
+        for file in iter_annexworktree(Path(self.repo.path) / self.subject):
+            # copy the template per file so records don't all alias one dict
+            rec = dict(file_rec)
+            rec['derived_from'] = [dict(d) for d in file_rec['derived_from']]
+            rec['derived_from'][0]['object'] = self.subjectpid
+            rec['display_label'] = file.name.name
+            generated = ['derived_from', 'display_label', 'pid']
+            if file.annexkey is None:
+                rec['pid'] = _construct_pid(key=file.gitsha)
+            else:
+                rec['pid'] = _construct_pid(key=file.annexkey)
+                rec['byte_size'] = file.annexsize
+                generated.append('byte_size')
+            # add annotation that a script provided this info
+            recs.append(self.machine_prov(rec, generated))
+        return recs
+
+    def machine_prov(
+            self,
+            rec: dict,
+            generated: list
+    ) -> dict:
+        """Set provenance 'attributes' on `rec` in place and return `rec`."""
+        # one nested annotation per field name listed in `generated`
+        field_notes = [
+            {'predicate': 'prov:generated',
+             'value': field,
+             'characterized_by': [{
+                 'predicate': 'prov:generated_by',
+                 'object': script_pid
+             }]}
+            for field in generated]
+        # the dataset path is recorded as the import source of the record
+        rec['attributes'] = [{'predicate': 'http://purl.org/pav/importedFrom',
+                              'value': self.repo.path,
+                              'attributes': field_notes}]
+        return rec
+
+    def register(self, print_only: bool):
+        """Create the file records for the configured subject.
+
+        With `print_only` they are pretty-printed; otherwise nothing yet."""
+        recs = self.create_records(file_rec=bare_file_rec)
+        if not print_only:
+            # TODO: upload records to dumpthings
+            return
+        pprint(recs)
+
+
+def create_study_record(
+        study_rec,
+        title
+) -> tuple[str, dict]:
+    """Return a new study pid and a record built from a copy of `study_rec`."""
+    pid = _construct_pid(prefix='xyzrins', key=str(uuid.uuid4()))
+    # copy, so the template passed in (a module-level dict) stays pristine
+    study_rec = {**study_rec, 'pid': pid, 'title': title}
+    return pid, study_rec
+
+
+def create_subject_record(
+        study_pid,
+        subject_rec,
+        label,
+        kind="obo:NCBITaxon_9606",  # human
+) -> tuple[str, dict]:
+    """Return a new subject pid and a record copied from `subject_rec`."""
+    pid = _construct_pid(prefix='xyzrins', key=str(uuid.uuid4()))
+    # copy, so the template passed in (a module-level dict) stays pristine
+    subject_rec = {**subject_rec, 'pid': pid, 'study': study_pid,
+                   'display_label': label, 'kind': kind}
+    return pid, subject_rec
+
+
+def _construct_pid(
+        key: str,
+        prefix: str = 'dldi',
+) -> str:
+    """Join `prefix` and `key` into an identifier of the form 'prefix:key'."""
+    # no validation or uniqueness check is performed here
+    return f'{prefix}:{key}'
+
+
+@click.command()
+@click.option('--dtc-api-url', '-a', default='https://pool.v0.trr379.de/api')
+@click.option('--dtc-collection', '-c', default=['public'], multiple=True)
+@click.option('--dataset')
+@click.option('--study-pid')
+@click.option('--sub')
+@click.option('--study-label')
+@click.option('--sub-pid')
+def main(
+        dataset: str,
+        sub: str,
+        study_pid: str | None = None,
+        study_label: str | None = None,
+        sub_pid: str | None = None,
+        dtc_api_url: str = 'https://pool.psychoinformatics.de/api',
+        dtc_collection: str = 'public',
+) -> None:
+    """
+    Clone a BIDS-conform DataLad dataset and, per subject, create file records
+    for each included file.
+
+    Usage Notes:
+
+    Provide the following arguments:
+    - service URL of the dumpthings deployment (e.g., https://pool.v0.trr379.de)
+    - the collection(s) in which records should be queried (e.g., public).
+      Several collections can be queried by prodividing the argument multiple
+      times (--dtc-collection public --dtc-collection protected)
+    - local path of the dataset to index
+    - subject to process
+    - optionally: pid of the study the dataset belongs to
+    - optionally: pid of the subject record to link
+    Run the script using uv:
+
+        > uv run TODO
+    """
+    repo = Dataset(Path(dataset))
+    if not study_pid:
+        study_pid, study_rec = \
+            create_study_record(study_rec=bare_study_rec,
+                                title=study_label)
+    if not sub_pid:
+        sub_pid, subject_rec = \
+            create_subject_record(study_pid=study_pid,
+                                  subject_rec=bare_subject_rec,
+                                  label=study_label + '-' + sub)
+    # TODO list and submit these records
+    ar = FileRegistrator(
+        repo=repo,
+        dtc_api_url=dtc_api_url,
+        dtc_collection=dtc_collection,
+        studypid=study_pid,
+        subjectpid=sub_pid,
+        subject=sub,
+    )
+    ar.register(
+        print_only=True,
+    )
+    """
+    If print_only is set to True, records are not submitted, just displayed
+    in stdout"""
+    #assert "DTC_TOKEN" in environ
+
+
+
+if __name__ == '__main__':
+    main()
-- 
2.52.0


From 54b2cb313a9e2e6a1d50bc9db511d940d988eff2 Mon Sep 17 00:00:00 2001
From: Adina Wagner <adina.wagner@t-online.de>
Date: Tue, 2 Jun 2026 16:07:41 +0200
Subject: [PATCH 2/4] add some logging

---
 code/index-dataset-files.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/code/index-dataset-files.py b/code/index-dataset-files.py
index b771180..9fbe4a4 100644
--- a/code/index-dataset-files.py
+++ b/code/index-dataset-files.py
@@ -9,7 +9,7 @@
 # ]
 # ///
 
-
+import logging
 import uuid
 import rich_click as click
 from os import environ
@@ -19,6 +19,9 @@ from pprint import pprint
 from datalad.api import Dataset
 from datalad_next.iter_collections import iter_annexworktree
 
+
+logger = logging.getLogger(__name__)
+
 """
 Brainstorming how this tool can work:
 
@@ -203,10 +206,12 @@ def main(
     """
     repo = Dataset(Path(dataset))
     if not study_pid:
+        logging.info('Creating new study record')
         study_pid, study_rec = \
             create_study_record(study_rec=bare_study_rec,
                                 title=study_label)
     if not sub_pid:
+        logging.info('Creating new subject record')
         sub_pid, subject_rec = \
             create_subject_record(study_pid=study_pid,
                                   subject_rec=bare_subject_rec,
-- 
2.52.0


From 8ff8ca8fde10727bce249a07d7c3c5b23eb03576 Mon Sep 17 00:00:00 2001
From: Adina Wagner <adina.wagner@t-online.de>
Date: Tue, 2 Jun 2026 16:07:59 +0200
Subject: [PATCH 3/4] WIP: start thinking about checks for existing records

---
 code/index-dataset-files.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/code/index-dataset-files.py b/code/index-dataset-files.py
index 9fbe4a4..d274f79 100644
--- a/code/index-dataset-files.py
+++ b/code/index-dataset-files.py
@@ -19,6 +19,10 @@ from pprint import pprint
 from datalad.api import Dataset
 from datalad_next.iter_collections import iter_annexworktree
 
+from dump_things_pyclient.communicate import (
+    collection_write_record,
+    collection_read_record_with_pid,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -78,6 +82,24 @@ class FileRegistrator(object):
         self.dtc_api_url = dtc_api_url
         self.dtc_collection = dtc_collection
 
+    def check_existing_subject_record(
+            self,
+            pid):
+        """Look up the subject record `pid`. If there already is one, we need
+        to check for files derived from it: if we find those, we can't create
+        new records, but need to check old records (not implemented yet)."""
+        old_rec = self._get_existing_record(pid)
+
+    def _get_existing_record(self, pid: str) -> dict | None:
+        """Read the record stored under `pid` from the configured service."""
+        # the access token comes from the DTC_TOKEN environment variable;
+        # an unset variable raises KeyError
+        return collection_read_record_with_pid(
+            service_url=self.dtc_api_url,
+            collection=self.dtc_collection,
+            pid=pid,
+            token=environ['DTC_TOKEN'])
+
     def create_records(
             self,
             file_rec,
-- 
2.52.0


From 941ccbd55740c56745878c79d7e7fd54ccb9da04 Mon Sep 17 00:00:00 2001
From: Adina Wagner <adina.wagner@t-online.de>
Date: Tue, 2 Jun 2026 16:08:29 +0200
Subject: [PATCH 4/4] Short options for some params, make study_label mandatory

---
 code/index-dataset-files.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/code/index-dataset-files.py b/code/index-dataset-files.py
index d274f79..d30886c 100644
--- a/code/index-dataset-files.py
+++ b/code/index-dataset-files.py
@@ -191,9 +191,9 @@ def _construct_pid(
 
 
 @click.command()
-@click.option('--dtc-api-url', '-a', default='https://pool.v0.trr379.de/api')
+@click.option('--dtc-api-url', '-a', default='https://pool.psychoinformatics.de/api')
 @click.option('--dtc-collection', '-c', default=['public'], multiple=True)
-@click.option('--dataset')
+@click.option('--dataset', '-d')
 @click.option('--study-pid')
 @click.option('--sub')
 @click.option('--study-label')
@@ -201,8 +201,8 @@ def _construct_pid(
 def main(
         dataset: str,
         sub: str,
+        study_label: str,
         study_pid: str | None = None,
-        study_label: str | None = None,
         sub_pid: str | None = None,
         dtc_api_url: str = 'https://pool.psychoinformatics.de/api',
         dtc_collection: str = 'public',
@@ -214,14 +214,20 @@ def main(
     Usage Notes:
 
     Provide the following arguments:
-    - service URL of the dumpthings deployment (e.g., https://pool.v0.trr379.de)
-    - the collection(s) in which records should be queried (e.g., public).
-      Several collections can be queried by prodividing the argument multiple
-      times (--dtc-collection public --dtc-collection protected)
-    - local path of the dataset to index
-    - subject to process
-    - optionally: pid of the study the dataset belongs to
-    - optionally: pid of the subject record to link
+    - --dtc-api-url / -a: service URL of the dumpthings deployment (e.g.,
+     https://pool.psychoinformatics.de/api)
+    - --dtc-collection / -c: the collection into which records should be curated
+     (e.g., public).
+    - --dataset / -d: local path of the dataset to index
+    - --sub: within-dataset path to the subject directory to process, e.g.
+     'sub-01'.
+    - --study-pid (opt): Existing pid of the study the dataset belongs to. If
+     not given, a new study record is created.
+    - --study-label: A label to use for study and subject when creating new
+     records.
+    - --sub-pid (opt): Existing pid of the subject record to link. If not given,
+     a new subject record will be created, using the --sub argument as a label
+
     Run the script using uv:
 
         > uv run TODO
-- 
2.52.0