From 1a3fa2da6999feffb7286ecd5b9c6471c5566112 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 17 Mar 2026 16:27:29 +0100 Subject: [PATCH 1/9] support record_id search in gitaudit backend This commit adds an index to the gitaudit backend that contains the IDs of all records that were added to the audit log. This allows to search through IDs. --- dump_things_service/audit/gitaudit.py | 73 +++++++++++++++---- .../commands/report_gitaudit.py | 20 +++-- 2 files changed, 72 insertions(+), 21 deletions(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index 8d4ce51..f039079 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -7,6 +7,7 @@ committed. Changes are annotated with a time stamp and a user-id """ import hashlib +import re from datetime import datetime from pathlib import Path @@ -28,9 +29,10 @@ class GitAuditBackend(AuditBackend): path: Path, ): self.path = path - self.cache = {} + self.index_path = None + self.cached_index_entries = [] self.current_change_set = {} - self.repo = self._init_repo() + self._init_repo() def add_record( self, @@ -41,11 +43,15 @@ class GitAuditBackend(AuditBackend): location = self._get_location_for(record_id) if self._has_pending_changes(location): self._persist_pending_changes() - self._add_elements(location, user_id, record) + self._add_elements(record_id, location, user_id, record) def flush(self): if self.current_change_set: self._persist_pending_changes() + if self.cached_index_entries: + with self.index_path.open('at') as f: + f.write('\n'.join(self.cached_index_entries) + '\n') + self.cached_index_entries = [] def get_audit_log( self, @@ -96,8 +102,26 @@ class GitAuditBackend(AuditBackend): changes.sort() return {c[0]: c[1:] for c in changes} + def get_audit_logs( + self, + record_id_pattern: str, + ) -> dict: + self.flush() + matcher = re.compile(record_id_pattern) + matching_ids = tuple( + filter( + lambda record_id: 
matcher.match(record_id) is not None, + self.index, + ) + ) + return { + record_id: self.get_audit_log(record_id) + for record_id in sorted(matching_ids) + } + def _add_elements( self, + record_id: str, location: tuple[str, Path, Path], user_id: str, record: dict, @@ -111,6 +135,7 @@ class GitAuditBackend(AuditBackend): default_flow_style=False, ) self._add_log_entry(location[2], user_id) + self._add_index_entry(record_id) return True return False @@ -124,6 +149,14 @@ class GitAuditBackend(AuditBackend): log_content += f'{time_stamp} {user_id}\n' self.current_change_set[log_location] = log_content + def _add_index_entry( + self, + record_id: str, + ): + if record_id not in self.index: + self.cached_index_entries.append(record_id) + self.index.add(record_id) + def _read_from_repo_path( self, path: Path, @@ -181,19 +214,33 @@ class GitAuditBackend(AuditBackend): location_dir / (base + '.log'), ) - def _init_repo(self) -> Repo: + def _init_repo(self) -> None: if self.path.exists(): is_empty = len(tuple(Path(self.path).glob('**'))) == 1 else: self.path.mkdir(parents=True) is_empty = True - if is_empty: - call_git(['init', '--bare', str(self.path)], cwd=self.path) - self.repo = Repo(self.path) + self.index_path = self.path / 'index.log' - apply_changeset( - self.repo, - {'README.txt': 'A git-based audit backend\n'}, - message='add README.txt', - ) - return self.repo + if is_empty: + call_git(['init', '--bare', str(self.path)], capture_output=True) + self.repo = Repo(self.path) + apply_changeset( + self.repo, + {'README.txt': 'A git-based audit backend\n'}, + message='add README.txt', + ) + self.index_path.write_text('') + else: + self.repo = Repo(self.path) + + with open(self.index_path, 'rt') as f: + self.index = set(line.strip() for line in f.readlines()) + + def _add_to_index( + self, + record_id: str, + ): + if record_id not in self.index: + self.cached_index_entries.append(record_id) + self.index.add(record_id) diff --git 
a/dump_things_service/commands/report_gitaudit.py b/dump_things_service/commands/report_gitaudit.py index 5ec037a..03e4539 100644 --- a/dump_things_service/commands/report_gitaudit.py +++ b/dump_things_service/commands/report_gitaudit.py @@ -20,7 +20,9 @@ parser.add_argument( ) parser.add_argument( 'pid', - help='The PID of the record for which audit information should be reported.', + help='The PID of the record for which audit information should be reported ' + '(interpreted as regular expression, to see all audit log entries, ' + 'specify ".*").', ) @@ -28,15 +30,17 @@ def main(): arguments = parser.parse_args() audit_backend = GitAuditBackend(Path(arguments.audit_store)) - changes = audit_backend.get_audit_log(arguments.pid) - + all_changes = audit_backend.get_audit_logs(arguments.pid) output = { - time_stamp: { - 'user-id': change[0], - 'diff': change[1], - 'resulting-record': change[2], + record_id: { + time_stamp: { + 'user-id': change[0], + 'diff': change[1], + 'resulting-record': change[2], + } + for time_stamp, change in record_changes.items() } - for time_stamp, change in changes.items() + for record_id, record_changes in all_changes.items() } print(json.dumps(output, indent=2, ensure_ascii=False)) -- 2.52.0 From 30c7dfa124a60cd1b405ad932a7ca2a4cee5636a Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 17 Mar 2026 21:30:46 +0100 Subject: [PATCH 2/9] use `fullmatch` in regex matching Use `fullmatch` when matching record IDs against the ID-pattern in `get_audit_logs`. 
--- dump_things_service/audit/gitaudit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index f039079..dc4f39e 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -110,7 +110,7 @@ class GitAuditBackend(AuditBackend): matcher = re.compile(record_id_pattern) matching_ids = tuple( filter( - lambda record_id: matcher.match(record_id) is not None, + lambda record_id: matcher.fullmatch(record_id) is not None, self.index, ) ) -- 2.52.0 From 3f716af542bcf625bd44056e4559b5dca5a18135 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Mar 2026 09:36:13 +0100 Subject: [PATCH 3/9] include all tests in test execution by default This commit fixes erroneous parameters to test execution that prevented the collection of all tests in the source. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9104d5f..7d9a20c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,7 +118,7 @@ extra-dependencies = [ ] [tool.hatch.envs.tests.scripts] -run = 'python -m pytest {args:dump_things_service/tests dump_things_service/backends/tests}' +run = 'python -m pytest {args}' [tool.ruff] extend-exclude = [ -- 2.52.0 From 02f5c5f385fb2b2d06684da051deb402868cdf2b Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Mar 2026 09:46:55 +0100 Subject: [PATCH 4/9] rename `dump-things-report gitaudit` `dump-things-report-gitaudit` is renamed to `dump-things-gitaudit-report`. This will lead to better command names, when more commands are introduced that work with `gitaudit`-backends, e.g., `dump-things-gitaudit-rebuild-index`. 
Generally audit-backend commands should be named according to the following schema: `dump-things-<backend>-<command>` --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7d9a20c..8ff4506 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ dump-things-rebuild-index = "dump_things_service.commands.rebuild_index:main" dump-things-copy-store = "dump_things_service.commands.copy_store:main" dump-things-pid-check = "dump_things_service.commands.check_pids:main" dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main" -dump-things-report-gitaudit = "dump_things_service.commands.report_gitaudit:main" +dump-things-gitaudit-report = "dump_things_service.commands.gitaudit_report:main" [tool.hatch.build.targets.wheel] exclude = [ -- 2.52.0 From efd53ec6177d571a0ec8816604f7f16b2be6c6a9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Mar 2026 10:03:18 +0100 Subject: [PATCH 5/9] use JSONL for `dump-things-gitaudit-report` results This commit changes the output format of `dump-things-gitaudit-report` to JSONL. Each record is self-contained, i.e., it contains the record ID, time-stamp, user-id, diff, and resulting record. This allows processing individual lines without keeping track of large dictionaries that span the complete output. 
--- ...{report_gitaudit.py => gitaudit_report.py} | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) rename dump_things_service/commands/{report_gitaudit.py => gitaudit_report.py} (60%) diff --git a/dump_things_service/commands/report_gitaudit.py b/dump_things_service/commands/gitaudit_report.py similarity index 60% rename from dump_things_service/commands/report_gitaudit.py rename to dump_things_service/commands/gitaudit_report.py index 03e4539..ca58c5e 100644 --- a/dump_things_service/commands/report_gitaudit.py +++ b/dump_things_service/commands/gitaudit_report.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import re import sys from argparse import ArgumentParser from pathlib import Path @@ -20,30 +21,33 @@ parser.add_argument( ) parser.add_argument( 'pid', - help='The PID of the record for which audit information should be reported ' - '(interpreted as regular expression, to see all audit log entries, ' - 'specify ".*").', + help='Regex pattern that identifies PIDs of the record for which audit ' + 'information should be reported ' + '(to see all audit log entries, specify ".*").', ) def main(): arguments = parser.parse_args() + try: + re.compile(arguments.pid) + except re.error as e: + print('Error in PID pattern:', e, file=sys.stderr, flush=True) + return 1 + audit_backend = GitAuditBackend(Path(arguments.audit_store)) all_changes = audit_backend.get_audit_logs(arguments.pid) - output = { - record_id: { - time_stamp: { + for record_id, report_changes in all_changes.items(): + for time_stamp, change in report_changes.items(): + report = { + 'time-stamp': time_stamp, + 'record-id': record_id, 'user-id': change[0], 'diff': change[1], 'resulting-record': change[2], } - for time_stamp, change in record_changes.items() - } - for record_id, record_changes in all_changes.items() - } - - print(json.dumps(output, indent=2, ensure_ascii=False)) + print(json.dumps(report, ensure_ascii=False), flush=True) return 0 -- 2.52.0 From 
1cfcf2f1660cdecccc4c0fc7d2c6b411649c47bb Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Mar 2026 10:53:42 +0100 Subject: [PATCH 6/9] add index-rebuilding to gitaudit backend The gitaudit-backend will now rebuild an index if no index is found in a backend (or if requested to do so). To ensure that indexes can be rebuilt, the interface was changed. Record-IDs are now read from the records themselves, i.e. from `record["pid"]`. This ensures that the IDs in the index are the same as the IDs in the persisted records. --- dump_things_service/audit/__init__.py | 6 ++-- dump_things_service/audit/gitaudit.py | 29 ++++++++++++++++++- .../audit/tests/test_gitaudit.py | 4 --- dump_things_service/curated.py | 1 - 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/dump_things_service/audit/__init__.py b/dump_things_service/audit/__init__.py index 3bfef2b..41de4b3 100644 --- a/dump_things_service/audit/__init__.py +++ b/dump_things_service/audit/__init__.py @@ -8,14 +8,14 @@ class AuditBackend(metaclass=ABCMeta): @abstractmethod def add_record( self, - record_id: str, record: dict, user_id: str, ) -> None: """Add information about a new record version to the audit log - :param record_id: the ID of the record (this is usually `record['pid']`. - :param record: the content of the new record (will be stored in YAML format). + :param record: the content of the new record. The record must contain + a `pid`-key which is associated with the ID of the record (the + record will be stored in YAML format). :param user_id: the ID of the user who adds the record. 
:return: A dictionary where the keys are time stamps of the changes, the values are tuples containing the elements: diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index dc4f39e..aecbcff 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -36,10 +36,10 @@ class GitAuditBackend(AuditBackend): def add_record( self, - record_id: str, record: dict, user_id: str, ) -> None: + record_id = record['pid'] location = self._get_location_for(record_id) if self._has_pending_changes(location): self._persist_pending_changes() @@ -234,6 +234,9 @@ class GitAuditBackend(AuditBackend): else: self.repo = Repo(self.path) + if not self.index_path.exists(): + self._rebuild_index() + with open(self.index_path, 'rt') as f: self.index = set(line.strip() for line in f.readlines()) @@ -244,3 +247,27 @@ class GitAuditBackend(AuditBackend): if record_id not in self.index: self.cached_index_entries.append(record_id) self.index.add(record_id) + + def _rebuild_index(self): + print('rebuilding index') + tree_entries = call_git( + ['ls-tree', '-r', 'master:'], + cwd=self.path, + capture_output=True, + ).decode().splitlines() + with open(self.index_path, 'wt') as f: + for line in tree_entries: + if not line.endswith('.yaml'): + print('ignoring line:', repr(line)) + continue + print('got line:', repr(line)) + flag, object_type, object_hash, file_name = line.split() + record = yaml.safe_load( + call_git( + ['show', object_hash], + cwd=self.path, + capture_output=True, + ).decode() + ) + print('got record:', repr(record)) + f.write(record['pid'] + '\n') diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index 1620355..f6e9340 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -27,7 +27,6 @@ def test_gitaudit_basic(tmp_path_factory): for index in range(4): backend.add_record( - 
record_id=record_id, record={'pid': record_id, 'content': index}, user_id=f'tester_{index}@example.com', ) @@ -54,13 +53,11 @@ def test_gitaudit_identical_change(tmp_path_factory): record_id = 'test_gitaudit_idempotent' backend.add_record( - record_id=record_id, record={'pid': record_id}, user_id='tester@example.com', ) backend.add_record( - record_id=record_id, record={'pid': record_id}, user_id='tester@example.com', ) @@ -91,7 +88,6 @@ def test_gitaudit_huge_log(tmp_path_factory): for i in range(record_number): record_id = f'huge_{i}' backend.add_record( - record_id=record_id, record={'pid': record_id, 'content': f'j:{j}, i:{i}'}, user_id='tester@example.com', ) diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index 6f12927..b64d921 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -377,7 +377,6 @@ async def store_curated_record( for audit_backend in instance_config.audit_backends[collection]: audit_backend.add_record( - record_id=pid, record=json_object, user_id=instance_config.tokens[collection][api_key]['user_id'], ) -- 2.52.0 From dddf1408bc33dbebf4439ed1717a387e5661d4c2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Mar 2026 12:57:52 +0100 Subject: [PATCH 7/9] add `dump-things-gitaudit-rebuild-index` Add the command line tool `dump-things-gitaudit-rebuild-index`. It can be used to rebuild indices of gitaudit databases. This is only useful for maintenance operations. The gitaudit-backend code will automatically build indices, if they are not existing. --- README.md | 6 +++- .../commands/gitaudit_rebuild_index.py | 29 +++++++++++++++++++ pyproject.toml | 1 + 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 dump_things_service/commands/gitaudit_rebuild_index.py diff --git a/README.md b/README.md index 2debf48..127df19 100644 --- a/README.md +++ b/README.md @@ -527,9 +527,13 @@ Here `` must be a path to a directory. If the directory does not exist, it will be created. 
If the directory exists, it should contain a bare git repository. -The commands `dump-things-report-gitaudit ` can be used to show the audit-log for the given `PID`. +The command `dump-things-gitaudit-report ` can be used to show the audit-log for all PIDs that match the given `PID`-pattern (pattern are in python `re`-module syntax, i.e. use `'.*'` to report changes for all PIDs). Each log entry contains the timestamp of the change, the ID of the curator that posted the change, a diff of the change, and the resulting record. +The command `dump-things-gitaudit-rebuild-index ` can be used to rebuild an index for a git-audit backend. +Executing this command should not be necessary in normal operations because the backend will rebuild an index if it is instantiated on a directory that has no index. +The command mainly exists for maintenance purposes. + Note: currently the user ID of the curator will be stored as author in the audit-log entries. The "original" author of a change is usually identified in the `annotations`-field of the record. diff --git a/dump_things_service/commands/gitaudit_rebuild_index.py b/dump_things_service/commands/gitaudit_rebuild_index.py new file mode 100644 index 0000000..bde297e --- /dev/null +++ b/dump_things_service/commands/gitaudit_rebuild_index.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import sys +from argparse import ArgumentParser +from pathlib import Path + +from dump_things_service.audit.gitaudit import GitAuditBackend + + +parser = ArgumentParser( + prog='Rebuild the index of a `gitaudit`-database', + description='This command rebuilds the index of a `gitaudit`-database.' +) +parser.add_argument( + 'audit_store', + help='The directory in which the `gitaudit`-database is located.' 
+) + + +def main(): + arguments = parser.parse_args() + + audit_backend = GitAuditBackend(Path(arguments.audit_store)) + audit_backend._rebuild_index() + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/pyproject.toml b/pyproject.toml index 8ff4506..af15cfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ dump-things-copy-store = "dump_things_service.commands.copy_store:main" dump-things-pid-check = "dump_things_service.commands.check_pids:main" dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main" dump-things-gitaudit-report = "dump_things_service.commands.gitaudit_report:main" +dump-things-gitaudit-rebuild-index = "dump_things_service.commands.gitaudit_rebuild_index:main" [tool.hatch.build.targets.wheel] exclude = [ -- 2.52.0 From c086fdaf56a9d3718020b24ec201db2dd12fcd90 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Mar 2026 15:16:07 +0100 Subject: [PATCH 8/9] use `committer_id` and `author_id` in audit logs Record the committer ID and the author ID separately in audit backends. --- dump_things_service/audit/__init__.py | 7 +++- dump_things_service/audit/gitaudit.py | 42 +++++++++++++++---- .../audit/tests/test_gitaudit.py | 17 +++++--- .../commands/gitaudit_report.py | 7 ++-- dump_things_service/curated.py | 2 +- dump_things_service/tests/test_curated.py | 2 +- 6 files changed, 57 insertions(+), 20 deletions(-) diff --git a/dump_things_service/audit/__init__.py b/dump_things_service/audit/__init__.py index 41de4b3..c7520ce 100644 --- a/dump_things_service/audit/__init__.py +++ b/dump_things_service/audit/__init__.py @@ -9,14 +9,17 @@ class AuditBackend(metaclass=ABCMeta): def add_record( self, record: dict, - user_id: str, + committer_id: str, + author_id: str | None = None, ) -> None: """Add information about a new record version to the audit log :param record: the content of the new record. 
The record must contain a `pid`-key which is associated with the ID of the record (the record will be stored in YAML format). - :param user_id: the ID of the user who adds the record. + :param committer_id: the ID of the user who adds the record. + :param author_id: the ID of the user who modified the record, defaults + to `committer_id` if not given. :return: A dictionary where the keys are time stamps of the changes, the values are tuples containing the elements: (user_id, diff, resulting_record), where user_id is the diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index aecbcff..5feeefe 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -8,6 +8,7 @@ Changes are annotated with a time stamp and a user-id """ import hashlib import re +import string from datetime import datetime from pathlib import Path @@ -37,13 +38,17 @@ class GitAuditBackend(AuditBackend): def add_record( self, record: dict, - user_id: str, + committer_id: str, + author_id: str = '', ) -> None: + author_id = committer_id if author_id == '' else author_id + committer_id = self._escape_person_id(committer_id) + author_id = self._escape_person_id(author_id) record_id = record['pid'] location = self._get_location_for(record_id) if self._has_pending_changes(location): self._persist_pending_changes() - self._add_elements(record_id, location, user_id, record) + self._add_elements(record_id, location, committer_id, author_id, record) def flush(self): if self.current_change_set: @@ -81,7 +86,7 @@ class GitAuditBackend(AuditBackend): log_diff_lines, ) )[0][1:] - time_stamp, user_id = log_entry.split(' ', 1) + time_stamp, committer_id, author_id = log_entry.split(' ') # Get the YAML diff yaml_diff_lines = call_git( @@ -97,7 +102,9 @@ class GitAuditBackend(AuditBackend): cwd=self.path, capture_output=True, ).decode() - changes.append((time_stamp, user_id, yaml_diff, yaml_content)) + changes.append( + (time_stamp, 
committer_id, author_id, yaml_diff, yaml_content) + ) changes.sort() return {c[0]: c[1:] for c in changes} @@ -123,7 +130,8 @@ class GitAuditBackend(AuditBackend): self, record_id: str, location: tuple[str, Path, Path], - user_id: str, + committer_id: str, + author_id: str, record: dict, ) -> bool: existing_record = self._read_record_from_repo_path(location[1]) @@ -134,7 +142,7 @@ class GitAuditBackend(AuditBackend): allow_unicode=True, default_flow_style=False, ) - self._add_log_entry(location[2], user_id) + self._add_log_entry(location[2], committer_id, author_id) self._add_index_entry(record_id) return True return False @@ -142,11 +150,12 @@ class GitAuditBackend(AuditBackend): def _add_log_entry( self, log_location: Path, - user_id: str, + committer_id: str, + author_id: str, ) -> None: time_stamp = datetime.now().isoformat() log_content = self._read_from_repo_path(log_location).decode() - log_content += f'{time_stamp} {user_id}\n' + log_content += f'{time_stamp} {committer_id} {author_id}\n' self.current_change_set[log_location] = log_content def _add_index_entry( @@ -271,3 +280,20 @@ class GitAuditBackend(AuditBackend): ) print('got record:', repr(record)) f.write(record['pid'] + '\n') + + def _escape_person_id( + self, + person_id: str, + ): + if not person_id: + msg = f'empty ID string not allowed: {person_id}' + raise ValueError(msg) + if any( + map( + lambda character: character in person_id, + string.whitespace + ) + ): + msg = f'ID string must not contain whitespace: {person_id}' + raise ValueError(msg) + return person_id diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index f6e9340..ef1e059 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -28,7 +28,8 @@ def test_gitaudit_basic(tmp_path_factory): for index in range(4): backend.add_record( record={'pid': record_id, 'content': index}, - 
user_id=f'tester_{index}@example.com', + committer_id=f'committer_{100 + index}@x.org', + author_id=f'author_{index}@y.org', ) # Check that the log file has 4 entries @@ -43,7 +44,10 @@ def test_gitaudit_basic(tmp_path_factory): # Check that the changes are reported changes = backend.get_audit_log(record_id) assert len(changes) == 4 - assert tuple(map(lambda e: e[0], changes.values())) == tuple((f'tester_{i}@example.com' for i in range(4))) + assert tuple(map(lambda e: e[0:2], changes.values())) == tuple( + (f'committer_{100 + i}@x.org', f'author_{i}@y.org') + for i in range(4) + ) def test_gitaudit_identical_change(tmp_path_factory): @@ -54,12 +58,14 @@ def test_gitaudit_identical_change(tmp_path_factory): record_id = 'test_gitaudit_idempotent' backend.add_record( record={'pid': record_id}, - user_id='tester@example.com', + committer_id='committer_b@x.org', + author_id = 'author_b@y.org', ) backend.add_record( record={'pid': record_id}, - user_id='tester@example.com', + committer_id='committer_b@x.org', + author_id = 'author_b@y.org', ) # Check that there is only one entry in the audit log @@ -89,7 +95,8 @@ def test_gitaudit_huge_log(tmp_path_factory): record_id = f'huge_{i}' backend.add_record( record={'pid': record_id, 'content': f'j:{j}, i:{i}'}, - user_id='tester@example.com', + committer_id='committer@x.org', + author_id = 'author@y.org', ) # Check that the changes are reported diff --git a/dump_things_service/commands/gitaudit_report.py b/dump_things_service/commands/gitaudit_report.py index ca58c5e..06600b8 100644 --- a/dump_things_service/commands/gitaudit_report.py +++ b/dump_things_service/commands/gitaudit_report.py @@ -43,9 +43,10 @@ def main(): report = { 'time-stamp': time_stamp, 'record-id': record_id, - 'user-id': change[0], - 'diff': change[1], - 'resulting-record': change[2], + 'committer-id': change[0], + 'author-id': change[1], + 'diff': change[2], + 'resulting-record': change[3], } print(json.dumps(report, ensure_ascii=False), flush=True) 
return 0 diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index b64d921..96ae7c5 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -378,5 +378,5 @@ async def store_curated_record( for audit_backend in instance_config.audit_backends[collection]: audit_backend.add_record( record=json_object, - user_id=instance_config.tokens[collection][api_key]['user_id'], + committer_id=instance_config.tokens[collection][api_key]['user_id'], ) diff --git a/dump_things_service/tests/test_curated.py b/dump_things_service/tests/test_curated.py index 3af68d7..b23393d 100644 --- a/dump_things_service/tests/test_curated.py +++ b/dump_things_service/tests/test_curated.py @@ -145,4 +145,4 @@ def test_audit_backend(fastapi_client_simple): values = tuple(changes.values()) for i in range(2): assert values[i][0] == user_names[i] - assert yaml.safe_load(values[i][2]) == json_objects[i] + assert yaml.safe_load(values[i][3]) == json_objects[i] -- 2.52.0 From db5c9014d13e761b1f14a9eb77c7af521aa7fc43 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 18 Mar 2026 23:15:43 +0100 Subject: [PATCH 9/9] add `author_id`-parameter to curated write endpoint --- README.md | 2 +- dump_things_service/curated.py | 7 ++++++- dump_things_service/tests/test_curated.py | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 127df19..dbac399 100644 --- a/README.md +++ b/README.md @@ -677,7 +677,7 @@ The service provides the following user endpoints (In addition to user endpoints The service supports a set of curation endpoints that allows direct access to the curated area as well as the incoming areas. A `CURATOR`-token required to access these endpoints. -Details about the curation endpoints can be found in [this issue](https://github.com/christian-monch/dump-things-server/issues/118). 
+Details about the curation endpoints can be found in [this issue](https://codeberg.org/datalink/dump-things-server/issues/118). ### Tips & Tricks diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index 96ae7c5..7366343 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -44,17 +44,20 @@ if TYPE_CHECKING: _endpoint_curated_template = """ async def {name}( data: {model_var_name}.{class_name}, + author_id: str | None = None, api_key: str = Depends(api_key_header_scheme), ) -> JSONResponse: logger.info( - '{name}(%s, %s)', + '{name}(%s, %s, %s)', repr(data), + repr(author_id), repr({model_var_name}), ) return await store_curated_record( '{collection}', data, '{class_name}', + author_id, api_key, ) """ @@ -353,6 +356,7 @@ async def store_curated_record( collection: str, data: BaseModel, class_name: str, + author_id: str | None = None, api_key: str | None = Depends(api_key_header_scheme), ): @@ -379,4 +383,5 @@ async def store_curated_record( audit_backend.add_record( record=json_object, committer_id=instance_config.tokens[collection][api_key]['user_id'], + author_id=author_id, ) diff --git a/dump_things_service/tests/test_curated.py b/dump_things_service/tests/test_curated.py index b23393d..e6a99a3 100644 --- a/dump_things_service/tests/test_curated.py +++ b/dump_things_service/tests/test_curated.py @@ -131,7 +131,7 @@ def test_audit_backend(fastapi_client_simple): for i in range(2): response = test_client.post( - '/collection_1/curated/record/Person', + f'/collection_1/curated/record/Person?author_id=author_{i}@www.org', headers={'x-dumpthings-token': tokens[i]}, json=json_objects[i], ) @@ -145,4 +145,5 @@ def test_audit_backend(fastapi_client_simple): values = tuple(changes.values()) for i in range(2): assert values[i][0] == user_names[i] + assert values[i][1] == f'author_{i}@www.org' assert yaml.safe_load(values[i][3]) == json_objects[i] -- 2.52.0