From f3c20c913e6da3dd205989b61de7747f16a33f4f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 11 Mar 2026 14:38:09 +0100 Subject: [PATCH 01/17] add a git-based audit trail backend --- dump_things_service/audit/__init__.py | 0 dump_things_service/audit/gitaudit.py | 158 ++++++++++++++++++ dump_things_service/audit/tests/__init__.py | 0 .../audit/tests/test_gitaudit.py | 91 ++++++++++ 4 files changed, 249 insertions(+) create mode 100644 dump_things_service/audit/__init__.py create mode 100644 dump_things_service/audit/gitaudit.py create mode 100644 dump_things_service/audit/tests/__init__.py create mode 100644 dump_things_service/audit/tests/test_gitaudit.py diff --git a/dump_things_service/audit/__init__.py b/dump_things_service/audit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py new file mode 100644 index 0000000..b87fff3 --- /dev/null +++ b/dump_things_service/audit/gitaudit.py @@ -0,0 +1,158 @@ +"""A git-based audit backend + +The backend minimizes commits by caching changes until an already +changed record is changed again. In this case all changes are +committed. 
+ +Changes have a time stamp and a user-id +""" +import hashlib +import subprocess +from datetime import datetime +from pathlib import Path + +import yaml + + +class GitAuditBackend: + + def __init__( + self, + path: Path, + ): + self.path = path + self.init_repo() + # Check for empty directory, if empty, initialize git-repo + + def add_record( + self, + record_id: str, + record: dict, + user_id: str, + ): + location = self.get_location_for(record_id) + initialize_location = not self.elements_exists(location) + + if self.has_pending_changes(location): + self.persist_pending_changes() + self.add_elements(location, user_id, record) + + if initialize_location: + self.initialize_location(location) + + def add_elements( + self, + location: tuple[str, Path, Path], + user_id: str, + record: dict, + ) -> bool: + persisted_record = self.read_record(location[1]) + if persisted_record != record: + self.write_record(location[1], record) + self.add_log_entry(location[2], user_id) + return True + return False + + def add_log_entry( + self, + log_location: Path, + user_id: str, + ) -> None: + time_stamp = datetime.now().isoformat() + with (self.path / log_location).open('at') as f: + f.write(f'{time_stamp} {user_id}\n') + + def write_record( + self, + record_location: Path, + record: dict, + ): + with (self.path / record_location).open('wt') as f: + yaml.dump(record, stream=f) + + def read_record( + self, + location: Path, + ) -> dict | None: + if (self.path / location).exists(): + return yaml.safe_load((self.path / location).read_text()) + return None + + def elements_exists( + self, + location: tuple[str, Path, Path], + ) -> bool: + log_location, record_location = location[1:] + assert log_location.exists() == record_location.exists(), f'inconsistent elements exist status for: {location}' + return log_location.exists() + + def has_pending_changes( + self, + location: tuple[str, Path, Path], + ) -> bool: + log_pending, record_pending = self.pending_changes(*location[1:]) + + # 
If the pending state of log and record differ, the database has + # become inconsistent. For now, raise an error. This has to be fixed + # manually. + if log_pending != record_pending: + msg = ( + f'change status mismatch: changed: ' + f'{location[1]} ({log_pending}), ' + f'{location[2]} ({record_pending})' + ) + raise SystemError(msg) + + return log_pending + + def pending_changes( + self, + *paths: tuple[Path, ...], + ): + result = subprocess.run( + ['git', '-C', str(self.path), 'status', '--porcelain=v1', str(self.path)] + [ + str(p) for p in paths + ], + capture_output=True, + check=True, + ) + status = { + line[3:].decode(): True + for line in result.stdout.splitlines() + } + return (status.get(str(p), False) for p in paths) + + def persist_pending_changes(self) -> None: + subprocess.run( + ['git', '-C', str(self.path), 'commit', '-a', '-m', 'persist changes'], + #capture_output=True, + check=True, + ) + + def initialize_location( + self, + location: tuple[str, Path, Path], + ) -> None: + subprocess.run( + ['git', '-C', str(self.path), 'add', str(location[1]), str(location[2])], + #capture_output=True, + check=True, + ) + + def get_location_for( + self, + record_id: str, + ) -> tuple[str, Path, Path]: + base = hashlib.sha1(record_id.encode()).hexdigest() + return ( + base, + Path(base + '.yaml'), + Path(base + '.log'), + ) + + def init_repo(self) -> None: + subprocess.run( + ['git', '-C', str(self.path), 'init'], + #capture_output=True, + check=True, + ) diff --git a/dump_things_service/audit/tests/__init__.py b/dump_things_service/audit/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py new file mode 100644 index 0000000..ca1c7d3 --- /dev/null +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -0,0 +1,91 @@ +import subprocess +from pathlib import Path + +from dump_things_service.audit.gitaudit import GitAuditBackend + + +def 
_get_git_status(path: Path) -> dict: + result = subprocess.run( + ['git', '-C', str(path), 'status', '--porcelain=v1'], + capture_output=True, + check=True, + ) + return { + line[3:]: line[:2] + for line in result.stdout.decode().splitlines() + } + + +def _get_git_log(path: Path) -> list[str]: + result = subprocess.run( + ['git', '-C', str(path), 'log', '--oneline'], + capture_output=True, + check=True, + ) + return result.stdout.decode().splitlines() + + +def _get_audit_log_lines(backend: GitAuditBackend, record_id: str) -> list[str]: + locations = backend.get_location_for(record_id) + return (backend.path / locations[2]).read_text().splitlines() + + +def test_gitaudit_basic(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("gitaudit_backend") + + backend = GitAuditBackend(tmp_path) + + record_id = 'test_gitaudit_basic' + + for index in range(4): + backend.add_record( + record_id=record_id, + record={'pid': record_id, 'content': index}, + user_id=f'tester_{index}@example.com', + ) + + # Check that the log file has 4 entries + log_lines = _get_audit_log_lines(backend, record_id) + assert len(log_lines) == 4 + + # Check that the commit log has 3 entries + commit_log_lines = _get_git_log(tmp_path) + assert len(commit_log_lines) == 3 + + # Check that the files are in modified state. 
+ status = _get_git_status(tmp_path) + assert len(status) == 2 + assert tuple(status.values()) == ('M ', 'M ') + + +def test_gitaudit_identical_change(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("gitaudit_backend") + + backend = GitAuditBackend(tmp_path) + + record_id = 'test_gitaudit_idempotent' + backend.add_record( + record_id=record_id, + record={'pid': record_id}, + user_id='tester@example.com', + ) + # Check that the a new record was added + status = _get_git_status(tmp_path) + assert tuple(status.values()) == ('A ', 'A ') + + backend.add_record( + record_id=record_id, + record={'pid': record_id}, + user_id='tester@example.com', + ) + # Check that there is change in the repository + status = _get_git_status(tmp_path) + assert len(status) == 0 + + # Check that there is only one entry in the audit log + log_lines = _get_audit_log_lines(backend, record_id) + assert len(log_lines) == 1 + + # Check that there is only one entry in the commit history + commit_log_lines = _get_git_log(tmp_path) + assert len(commit_log_lines) == 1 -- 2.52.0 From c1287e65b9a410856f5954973048f168a485e518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=B6nch?= Date: Wed, 11 Mar 2026 16:21:37 +0100 Subject: [PATCH 02/17] use three-level directory hierarchies for files --- dump_things_service/audit/gitaudit.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index b87fff3..b707862 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -4,7 +4,7 @@ The backend minimizes commits by caching changes until an already changed record is changed again. In this case all changes are committed. 
-Changes have a time stamp and a user-id +Changes are annotated with a time stamp and a user-id """ import hashlib import subprocess @@ -125,7 +125,7 @@ class GitAuditBackend: def persist_pending_changes(self) -> None: subprocess.run( ['git', '-C', str(self.path), 'commit', '-a', '-m', 'persist changes'], - #capture_output=True, + capture_output=True, check=True, ) @@ -135,7 +135,7 @@ class GitAuditBackend: ) -> None: subprocess.run( ['git', '-C', str(self.path), 'add', str(location[1]), str(location[2])], - #capture_output=True, + capture_output=True, check=True, ) @@ -144,15 +144,20 @@ class GitAuditBackend: record_id: str, ) -> tuple[str, Path, Path]: base = hashlib.sha1(record_id.encode()).hexdigest() + dir_1, dir_2, name = base[0:3], base[3:6], base[6:] + location_dir = Path(dir_1) / Path(dir_2) + (self.path / location_dir).mkdir(parents=True, exist_ok=True) return ( base, - Path(base + '.yaml'), - Path(base + '.log'), + location_dir / (base + '.yaml'), + location_dir / (base + '.log'), ) def init_repo(self) -> None: - subprocess.run( - ['git', '-C', str(self.path), 'init'], - #capture_output=True, - check=True, - ) + is_empty = len(tuple(Path(self.path).glob('**'))) == 1 + if is_empty: + subprocess.run( + ['git', '-C', str(self.path), 'init'], + capture_output=True, + check=True, + ) -- 2.52.0 From acc09813363bc6c82f67f65205b25ef73ac84bf6 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 12 Mar 2026 09:01:42 +0100 Subject: [PATCH 03/17] add change reports for IDs This commit adds a method that reports all changes that were applied to a record with a specific ID. The report is a dictionary. 
The keys are ISO timestamps, the values are tuples containing the following entries: 0: the user id that did the change 1: the git diff of the change 2: the content of the record after the change --- dump_things_service/audit/gitaudit.py | 76 ++++++++++++++++++- .../audit/tests/test_gitaudit.py | 9 +++ 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index b707862..371445e 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -110,7 +110,7 @@ class GitAuditBackend: *paths: tuple[Path, ...], ): result = subprocess.run( - ['git', '-C', str(self.path), 'status', '--porcelain=v1', str(self.path)] + [ + ['git', '-C', str(self.path), 'status', '--porcelain=v1'] + [ str(p) for p in paths ], capture_output=True, @@ -120,7 +120,7 @@ class GitAuditBackend: line[3:].decode(): True for line in result.stdout.splitlines() } - return (status.get(str(p), False) for p in paths) + return tuple((status.get(str(p), False) for p in paths)) def persist_pending_changes(self) -> None: subprocess.run( @@ -161,3 +161,75 @@ class GitAuditBackend: capture_output=True, check=True, ) + + def get_modifications( + self, + record_id: str, + ) -> dict: + if self.any_change(): + self.persist_pending_changes() + + # Get all commits that updated the log. 
Those will also have updated + # the records + changes = [] + yaml_location, log_location = map(str, self.get_location_for(record_id)[1:]) + commit_hashes = self.run_get_output_lines( + ['git', '-C', str(self.path), 'log', '--format=%H', '--', log_location] + ) + for commit_hash in commit_hashes: + log_diff_lines = self.run_get_output_lines( + ['git', '-C', str(self.path), 'show', '--format=%b', commit_hash, '--', log_location] + ) + # Get the log entry + log_entry = tuple( + filter( + lambda l: not l.startswith('+++') and l.startswith('+'), + log_diff_lines, + ) + )[0][1:] + time_stamp, user_id = log_entry.split(' ', 1) + + # Get the YAML diff + yaml_diff_lines = self.run_get_output_lines( + ['git', '-C', str(self.path), 'show', '--format=%b', commit_hash, '--', yaml_location] + ) + yaml_diff = '\n'.join(filter(lambda l: l != '', yaml_diff_lines)) + '\n' + + # Get the YAML content + yaml_content = self.run_get_output( + ['git', '-C', str(self.path), 'show', f'{commit_hash}:{yaml_location}'] + ) + changes.append((time_stamp, user_id, yaml_diff, yaml_content)) + + changes.sort() + return {c[0]: c[1:] for c in changes} + + + def run_get_output_lines( + self, + command: list[str], + ) -> list[str]: + result = self.run_get_output(command) + return result.splitlines() + + def run_get_output( + self, + command: list[str], + ) -> str: + result = subprocess.run( + command, + capture_output=True, + check=True, + ) + return result.stdout.decode() + + def any_change( + self, + ) -> bool: + result = subprocess.run( + ['git', '-C', str(self.path), 'status', '--porcelain=v1'], + capture_output=True, + check=True, + ) + lines = result.stdout.decode().splitlines() + return len(lines) > 0 diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index ca1c7d3..62cab7e 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -57,6 +57,11 @@ def 
test_gitaudit_basic(tmp_path_factory): assert len(status) == 2 assert tuple(status.values()) == ('M ', 'M ') + # Check that the changes are reported + changes = backend.get_modifications(record_id) + assert len(changes) == 4 + assert tuple(map(lambda e: e[0], changes.values())) == tuple((f'tester_{i}@example.com' for i in range(4))) + def test_gitaudit_identical_change(tmp_path_factory): tmp_path = tmp_path_factory.mktemp("gitaudit_backend") @@ -89,3 +94,7 @@ def test_gitaudit_identical_change(tmp_path_factory): # Check that there is only one entry in the commit history commit_log_lines = _get_git_log(tmp_path) assert len(commit_log_lines) == 1 + + # Check that the changes are reported + changes = backend.get_modifications(record_id) + assert len(changes) == 1 -- 2.52.0 From e6306f65c20239736a52f66976d12fbab05bf93b Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 12 Mar 2026 14:09:24 +0100 Subject: [PATCH 04/17] add gitaudit-description to changelog --- CHANGELOG.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 10d0b85..d3b7ef9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,15 @@ -# 5.5.1 (2026-03-10) +# x.x.x () + +## New features + +- Support for audit backends was added to `dump-things-service`. Currently there + is one audit backend: `gitaudit`. The audit backend stores provenance information + about records, i.e. who changed what at which time. + +- The new tool `dump-things-report-gitaudit` reports audit information for + individual PIDs, i.e., timestamps, user-id, associated diffs, and the + resulting record. 
+ ## Improvements -- 2.52.0 From 038d18399b5114af6771e088a4633c988d5f4b54 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 12 Mar 2026 14:28:06 +0100 Subject: [PATCH 05/17] add dump-things-report-gitaudit command --- .../commands/report_gitaudit.py | 46 +++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 47 insertions(+) create mode 100644 dump_things_service/commands/report_gitaudit.py diff --git a/dump_things_service/commands/report_gitaudit.py b/dump_things_service/commands/report_gitaudit.py new file mode 100644 index 0000000..d67c574 --- /dev/null +++ b/dump_things_service/commands/report_gitaudit.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import json +import sys +from argparse import ArgumentParser + +from dump_things_service.audit.gitaudit import GitAuditBackend + + +parser = ArgumentParser( + prog='Report audit information for a PID', + description='Report the audit information that was stored for a specific ' + 'PID. For every change to a record the tool will report: ' + 'time stamp, user ID, diff, and the resulting record.', +) +parser.add_argument( + 'audit_store', + help='The path to the gitaudit store', +) +parser.add_argument( + 'pid', + help='The PID of the record for which audit information should be reported.', +) + + +def main(): + arguments = parser.parse_args() + + audit_backend = GitAuditBackend(arguments.audit_store) + changes = audit_backend.get_modifications(arguments.pid) + + output = { + time_stamp: { + 'user-id': change[0], + 'diff': change[1], + 'resulting-record': change[2], + } + for time_stamp, change in changes.items() + } + + print(json.dumps(output, indent=2, ensure_ascii=False)) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/pyproject.toml b/pyproject.toml index f852c96..d99e64a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dump-things-rebuild-index = "dump_things_service.commands.rebuild_index:main" dump-things-copy-store = 
"dump_things_service.commands.copy_store:main" dump-things-pid-check = "dump_things_service.commands.check_pids:main" dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main" +dump-things-report-gitaudit = "dump_things_service.commands.report_gitaudit:main" [tool.hatch.build.targets.wheel] exclude = [ -- 2.52.0 From f87256fd86cc54c1beea14abd5b4586afd170ce4 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 12 Mar 2026 14:46:03 +0100 Subject: [PATCH 06/17] create gitaudit-dir if it does not exist --- dump_things_service/audit/gitaudit.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index 371445e..f36b7a5 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -154,7 +154,11 @@ class GitAuditBackend: ) def init_repo(self) -> None: - is_empty = len(tuple(Path(self.path).glob('**'))) == 1 + if self.path.exists(): + is_empty = len(tuple(Path(self.path).glob('**'))) == 1 + else: + self.path.mkdir(parents=True) + is_empty = True if is_empty: subprocess.run( ['git', '-C', str(self.path), 'init'], -- 2.52.0 From 0b7dee74859a4f3d899ddeaa10606a11289bd9b8 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 13 Mar 2026 13:57:59 +0100 Subject: [PATCH 07/17] implement worktree-free audit trail Add changes directly to git-index, and commit from there. This keeps the worktree empty. 
--- dump_things_service/audit/gitaudit.py | 254 +++++++++--------- .../audit/tests/test_gitaudit.py | 17 +- .../commands/report_gitaudit.py | 2 +- dump_things_service/config.py | 6 + 4 files changed, 146 insertions(+), 133 deletions(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index f36b7a5..f91ba49 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -31,14 +31,52 @@ class GitAuditBackend: user_id: str, ): location = self.get_location_for(record_id) - initialize_location = not self.elements_exists(location) - if self.has_pending_changes(location): self.persist_pending_changes() self.add_elements(location, user_id, record) - if initialize_location: - self.initialize_location(location) + def flush(self): + if self.any_change(): + self.persist_pending_changes() + + def get_audit_log( + self, + record_id: str, + ) -> dict: + self.flush() + + # Get all commits that updated the log. Those will also have updated + # the records + changes = [] + yaml_location, log_location = map(str, self.get_location_for(record_id)[1:]) + commit_hashes = self.git_lines(['log', '--format=%H', '--', log_location]) + for commit_hash in commit_hashes: + log_diff_lines = self.git_lines( + ['show', '--format=%b', commit_hash, '--', log_location] + ) + # Get the log entry + log_entry = tuple( + filter( + lambda l: not l.startswith('+++') and l.startswith('+'), + log_diff_lines, + ) + )[0][1:] + time_stamp, user_id = log_entry.split(' ', 1) + + # Get the YAML diff + yaml_diff_lines = self.git_lines( + ['show', '--format=%b', commit_hash, '--', yaml_location] + ) + yaml_diff = '\n'.join(filter(lambda l: l != '', yaml_diff_lines)) + '\n' + + # Get the YAML content + yaml_content = self.git_execute( + ['show', f'{commit_hash}:{yaml_location}'] + ).decode() + changes.append((time_stamp, user_id, yaml_diff, yaml_content)) + + changes.sort() + return {c[0]: c[1:] for c in changes} def add_elements( self, @@ 
-46,8 +84,8 @@ class GitAuditBackend: user_id: str, record: dict, ) -> bool: - persisted_record = self.read_record(location[1]) - if persisted_record != record: + existing_record = self.read_record_from_repo_path(location[1]) + if existing_record != record: self.write_record(location[1], record) self.add_log_entry(location[2], user_id) return True @@ -59,39 +97,47 @@ class GitAuditBackend: user_id: str, ) -> None: time_stamp = datetime.now().isoformat() - with (self.path / log_location).open('at') as f: - f.write(f'{time_stamp} {user_id}\n') + log_content = self.read_from_repo_path(log_location).decode() + log_content += f'{time_stamp} {user_id}\n' + self.add_file_to_index(log_location, log_content) def write_record( self, record_location: Path, record: dict, ): - with (self.path / record_location).open('wt') as f: - yaml.dump(record, stream=f) + self.add_file_to_index(record_location, yaml.dump(record)) - def read_record( + def read_from_repo_path( self, - location: Path, - ) -> dict | None: - if (self.path / location).exists(): - return yaml.safe_load((self.path / location).read_text()) - return None + path: Path, + ) -> bytes: + branch = '' if self.in_index(path) else 'master' + try: + return self.git_execute(['cat-file', '-p', f'{branch}:{str(path)}']) + except subprocess.CalledProcessError as e: + return b'' - def elements_exists( + def read_record_from_repo_path( self, - location: tuple[str, Path, Path], + path: Path, + ): + return yaml.safe_load(self.read_from_repo_path(path)) + + def in_index( + self, + path: Path, ) -> bool: - log_location, record_location = location[1:] - assert log_location.exists() == record_location.exists(), f'inconsistent elements exist status for: {location}' - return log_location.exists() + lines = self.git_lines(['status', '--porcelain=v1']) + return any(line[3:] == str(path) for line in lines if not line.startswith('??')) def has_pending_changes( self, location: tuple[str, Path, Path], ) -> bool: - log_pending, record_pending = 
self.pending_changes(*location[1:]) + """Check if the log-file and the record-file are modified""" + log_pending, record_pending = self.pending_changes(*location[1:]) # If the pending state of log and record differ, the database has # become inconsistent. For now, raise an error. This has to be fixed # manually. @@ -102,42 +148,18 @@ class GitAuditBackend: f'{location[2]} ({record_pending})' ) raise SystemError(msg) - return log_pending def pending_changes( self, *paths: tuple[Path, ...], ): - result = subprocess.run( - ['git', '-C', str(self.path), 'status', '--porcelain=v1'] + [ - str(p) for p in paths - ], - capture_output=True, - check=True, - ) - status = { - line[3:].decode(): True - for line in result.stdout.splitlines() - } + lines = self.git_lines(['status', '--porcelain=v1']) + status = {line[3:]: True for line in lines} return tuple((status.get(str(p), False) for p in paths)) def persist_pending_changes(self) -> None: - subprocess.run( - ['git', '-C', str(self.path), 'commit', '-a', '-m', 'persist changes'], - capture_output=True, - check=True, - ) - - def initialize_location( - self, - location: tuple[str, Path, Path], - ) -> None: - subprocess.run( - ['git', '-C', str(self.path), 'add', str(location[1]), str(location[2])], - capture_output=True, - check=True, - ) + self.git_lines(['commit', '-m', 'persist changes']) def get_location_for( self, @@ -146,7 +168,6 @@ class GitAuditBackend: base = hashlib.sha1(record_id.encode()).hexdigest() dir_1, dir_2, name = base[0:3], base[3:6], base[6:] location_dir = Path(dir_1) / Path(dir_2) - (self.path / location_dir).mkdir(parents=True, exist_ok=True) return ( base, location_dir / (base + '.yaml'), @@ -160,80 +181,63 @@ class GitAuditBackend: self.path.mkdir(parents=True) is_empty = True if is_empty: - subprocess.run( - ['git', '-C', str(self.path), 'init'], - capture_output=True, - check=True, - ) + self.git_execute(['init']) - def get_modifications( - self, - record_id: str, - ) -> dict: - if self.any_change(): 
- self.persist_pending_changes() - - # Get all commits that updated the log. Those will also have updated - # the records - changes = [] - yaml_location, log_location = map(str, self.get_location_for(record_id)[1:]) - commit_hashes = self.run_get_output_lines( - ['git', '-C', str(self.path), 'log', '--format=%H', '--', log_location] - ) - for commit_hash in commit_hashes: - log_diff_lines = self.run_get_output_lines( - ['git', '-C', str(self.path), 'show', '--format=%b', commit_hash, '--', log_location] - ) - # Get the log entry - log_entry = tuple( - filter( - lambda l: not l.startswith('+++') and l.startswith('+'), - log_diff_lines, - ) - )[0][1:] - time_stamp, user_id = log_entry.split(' ', 1) - - # Get the YAML diff - yaml_diff_lines = self.run_get_output_lines( - ['git', '-C', str(self.path), 'show', '--format=%b', commit_hash, '--', yaml_location] - ) - yaml_diff = '\n'.join(filter(lambda l: l != '', yaml_diff_lines)) + '\n' - - # Get the YAML content - yaml_content = self.run_get_output( - ['git', '-C', str(self.path), 'show', f'{commit_hash}:{yaml_location}'] - ) - changes.append((time_stamp, user_id, yaml_diff, yaml_content)) - - changes.sort() - return {c[0]: c[1:] for c in changes} - - - def run_get_output_lines( - self, - command: list[str], - ) -> list[str]: - result = self.run_get_output(command) - return result.splitlines() - - def run_get_output( - self, - command: list[str], - ) -> str: - result = subprocess.run( - command, - capture_output=True, - check=True, - ) - return result.stdout.decode() - - def any_change( - self, - ) -> bool: - result = subprocess.run( - ['git', '-C', str(self.path), 'status', '--porcelain=v1'], - capture_output=True, - check=True, - ) - lines = result.stdout.decode().splitlines() + def any_change(self) -> bool: + lines = [ + line + for line in self.git_lines(['status', '--porcelain=v1']) + if not line.startswith(' D') + ] return len(lines) > 0 + + def git_execute( + self, + command: list[str], + stdin: str | bytes | None 
= None, + ) -> bytes: + result = subprocess.run( + ['git', '-C', str(self.path)] + command, + capture_output=True, + check=True, + input=stdin.encode() if isinstance(stdin, str) else stdin, + ) + return result.stdout + + def git_lines( + self, + command: list[str], + stdin: str | bytes | None = None, + ) -> list[str]: + result = self.git_execute(command, stdin=stdin) + return result.decode().splitlines() + + def git_one_line( + self, + command: list[str], + stdin: str | bytes | None = None, + ) -> str: + lines = self.git_lines(command, stdin) + if len(lines) != 1: + msg = f'expected one line as result, got: {len(lines)}: {lines}' + raise ValueError(msg) + return lines[0] + + def add_file_to_index( + self, + file_path: Path, + content: str, + ): + """Add a file to the index without storing it in the worktree""" + + object_hash = self.git_one_line( + ['hash-object', '-t', 'blob', '-w', '--stdin'], + stdin=content, + ) + self.git_lines( + [ + 'update-index', + '--add', + '--cacheinfo', f'100644,{object_hash},{file_path}', + ], + ) diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index 62cab7e..b1e11ad 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -1,6 +1,8 @@ import subprocess from pathlib import Path +from graphql.language import location + from dump_things_service.audit.gitaudit import GitAuditBackend @@ -27,7 +29,7 @@ def _get_git_log(path: Path) -> list[str]: def _get_audit_log_lines(backend: GitAuditBackend, record_id: str) -> list[str]: locations = backend.get_location_for(record_id) - return (backend.path / locations[2]).read_text().splitlines() + return backend.read_from_repo_path(locations[2]).decode().splitlines() def test_gitaudit_basic(tmp_path_factory): @@ -55,10 +57,10 @@ def test_gitaudit_basic(tmp_path_factory): # Check that the files are in modified state. 
status = _get_git_status(tmp_path) assert len(status) == 2 - assert tuple(status.values()) == ('M ', 'M ') + assert tuple(status.values()) == ('MD', 'MD') # Check that the changes are reported - changes = backend.get_modifications(record_id) + changes = backend.get_audit_log(record_id) assert len(changes) == 4 assert tuple(map(lambda e: e[0], changes.values())) == tuple((f'tester_{i}@example.com' for i in range(4))) @@ -76,16 +78,17 @@ def test_gitaudit_identical_change(tmp_path_factory): ) # Check that the a new record was added status = _get_git_status(tmp_path) - assert tuple(status.values()) == ('A ', 'A ') + assert tuple(status.values()) == ('AD', 'AD') backend.add_record( record_id=record_id, record={'pid': record_id}, user_id='tester@example.com', ) - # Check that there is change in the repository + # Check that there is no change in the repository, which means the files + # should just be registered as deleted. status = _get_git_status(tmp_path) - assert len(status) == 0 + assert tuple(status.values()) == (' D', ' D') # Check that there is only one entry in the audit log log_lines = _get_audit_log_lines(backend, record_id) @@ -96,5 +99,5 @@ def test_gitaudit_identical_change(tmp_path_factory): assert len(commit_log_lines) == 1 # Check that the changes are reported - changes = backend.get_modifications(record_id) + changes = backend.get_audit_log(record_id) assert len(changes) == 1 diff --git a/dump_things_service/commands/report_gitaudit.py b/dump_things_service/commands/report_gitaudit.py index d67c574..b242da4 100644 --- a/dump_things_service/commands/report_gitaudit.py +++ b/dump_things_service/commands/report_gitaudit.py @@ -27,7 +27,7 @@ def main(): arguments = parser.parse_args() audit_backend = GitAuditBackend(arguments.audit_store) - changes = audit_backend.get_modifications(arguments.pid) + changes = audit_backend.get_audit_log(arguments.pid) output = { time_stamp: { diff --git a/dump_things_service/config.py b/dump_things_service/config.py index 
59b8a16..5537bd4 100644 --- a/dump_things_service/config.py +++ b/dump_things_service/config.py @@ -129,6 +129,11 @@ class ConfigAuthConfig(StrictModel): type: Literal['config'] = 'config' +class GitAuditBackendConfig(StrictModel): + type: Literal['gitaudit'] + path: Path + + class TagConfig(StrictModel): submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' @@ -143,6 +148,7 @@ class CollectionConfig(StrictModel): submission_tags: TagConfig = TagConfig() use_classes: list[str] = dataclasses.field(default_factory=list) ignore_classes: list[str] = dataclasses.field(default_factory=list) + audit_backends: list[GitAuditBackendConfig] = dataclasses.field(default_factory=list) class GlobalConfig(StrictModel): -- 2.52.0 From e49ca98c1b54f4ab9e6531b391b3f543aa8b9267 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 13 Mar 2026 16:15:03 +0100 Subject: [PATCH 08/17] fix change detection Due to the empty worktree, `git status` reports even unchanged files as deleted. This led to unnecessary audit-commits. This commit fixes the behavior. 
--- dump_things_service/audit/gitaudit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index f91ba49..ae1d987 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -155,7 +155,7 @@ class GitAuditBackend: *paths: tuple[Path, ...], ): lines = self.git_lines(['status', '--porcelain=v1']) - status = {line[3:]: True for line in lines} + status = {line[3:]: True for line in lines if line[0] != ' '} return tuple((status.get(str(p), False) for p in paths)) def persist_pending_changes(self) -> None: -- 2.52.0 From c5be11dc386b4d22e9ce268dcb6a3f2aeb060cb4 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 13 Mar 2026 16:47:13 +0100 Subject: [PATCH 09/17] add an audit test with non-trivial change numbers --- .../audit/tests/test_gitaudit.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index b1e11ad..1c8e434 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -101,3 +101,29 @@ def test_gitaudit_identical_change(tmp_path_factory): # Check that the changes are reported changes = backend.get_audit_log(record_id) assert len(changes) == 1 + + +def test_gitaudit_huge_log(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("gitaudit_backend") + + backend = GitAuditBackend(tmp_path) + + change_number = 2 + record_number = 100 + + for j in range(change_number): + for i in range(record_number): + record_id = f'huge_{i}' + backend.add_record( + record_id=record_id, + record={'pid': record_id, 'content': f'j:{j}, i:{i}'}, + user_id='tester@example.com', + ) + + # Check that the changes are reported + for i in range(record_number): + record_id = f'huge_{i}' + changes = backend.get_audit_log(record_id) + assert len(changes) == change_number + 
print(f'- {record_id} --------------') + print(changes) -- 2.52.0 From 016576a6eb03168f21c1aae108dbdbe30dd90d5e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 15 Mar 2026 22:15:18 +0100 Subject: [PATCH 10/17] use `apply_change_set` Use the code from the minilad branch of datalad-core. --- dump_things_service/audit/gitaudit.py | 169 +++++++----------- .../audit/tests/test_gitaudit.py | 36 +--- 2 files changed, 67 insertions(+), 138 deletions(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index ae1d987..091c646 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -7,12 +7,16 @@ committed. Changes are annotated with a time stamp and a user-id """ import hashlib -import subprocess from datetime import datetime from pathlib import Path import yaml - +from datalad_core.git_utils import apply_changeset +from datalad_core.repo import Repo +from datalad_core.runners import ( + call_git, + CommandError, +) class GitAuditBackend: @@ -21,8 +25,9 @@ class GitAuditBackend: path: Path, ): self.path = path - self.init_repo() - # Check for empty directory, if empty, initialize git-repo + self.cache = {} + self.current_change_set = {} + self.repo = self.init_repo() def add_record( self, @@ -36,7 +41,7 @@ class GitAuditBackend: self.add_elements(location, user_id, record) def flush(self): - if self.any_change(): + if self.current_change_set: self.persist_pending_changes() def get_audit_log( @@ -49,11 +54,17 @@ class GitAuditBackend: # the records changes = [] yaml_location, log_location = map(str, self.get_location_for(record_id)[1:]) - commit_hashes = self.git_lines(['log', '--format=%H', '--', log_location]) + commit_hashes = call_git( + ['log', '--format=%H', '--', log_location], + cwd=self.path, + capture_output=True, + ).decode().splitlines() for commit_hash in commit_hashes: - log_diff_lines = self.git_lines( - ['show', '--format=%b', commit_hash, '--', log_location] - ) + 
log_diff_lines = call_git( + ['show', '--format=%b', commit_hash, '--', log_location], + cwd=self.path, + capture_output=True, + ).decode().splitlines() # Get the log entry log_entry = tuple( filter( @@ -64,14 +75,18 @@ class GitAuditBackend: time_stamp, user_id = log_entry.split(' ', 1) # Get the YAML diff - yaml_diff_lines = self.git_lines( - ['show', '--format=%b', commit_hash, '--', yaml_location] - ) + yaml_diff_lines = call_git( + ['show', '--format=%b', commit_hash, '--', yaml_location], + cwd=self.path, + capture_output=True, + ).decode().splitlines() yaml_diff = '\n'.join(filter(lambda l: l != '', yaml_diff_lines)) + '\n' # Get the YAML content - yaml_content = self.git_execute( - ['show', f'{commit_hash}:{yaml_location}'] + yaml_content = call_git( + ['show', f'{commit_hash}:{yaml_location}'], + cwd=self.path, + capture_output=True, ).decode() changes.append((time_stamp, user_id, yaml_diff, yaml_content)) @@ -86,7 +101,12 @@ class GitAuditBackend: ) -> bool: existing_record = self.read_record_from_repo_path(location[1]) if existing_record != record: - self.write_record(location[1], record) + self.current_change_set[location[1]] = yaml.dump( + data=record, + sort_keys=False, + allow_unicode=True, + default_flow_style=False, + ) self.add_log_entry(location[2], user_id) return True return False @@ -99,24 +119,22 @@ class GitAuditBackend: time_stamp = datetime.now().isoformat() log_content = self.read_from_repo_path(log_location).decode() log_content += f'{time_stamp} {user_id}\n' - self.add_file_to_index(log_location, log_content) - - def write_record( - self, - record_location: Path, - record: dict, - ): - self.add_file_to_index(record_location, yaml.dump(record)) + self.current_change_set[log_location] = log_content def read_from_repo_path( self, path: Path, ) -> bytes: - branch = '' if self.in_index(path) else 'master' try: - return self.git_execute(['cat-file', '-p', f'{branch}:{str(path)}']) - except subprocess.CalledProcessError as e: - return b'' + 
return call_git( + ['cat-file', '-p', f'master:{str(path)}'], + cwd=self.path, + capture_output=True, + ) + except CommandError as e: + if e.returncode == 128: + return b'' + raise def read_record_from_repo_path( self, @@ -124,23 +142,12 @@ class GitAuditBackend: ): return yaml.safe_load(self.read_from_repo_path(path)) - def in_index( - self, - path: Path, - ) -> bool: - lines = self.git_lines(['status', '--porcelain=v1']) - return any(line[3:] == str(path) for line in lines if not line.startswith('??')) - def has_pending_changes( self, location: tuple[str, Path, Path], ) -> bool: - """Check if the log-file and the record-file are modified""" - - log_pending, record_pending = self.pending_changes(*location[1:]) - # If the pending state of log and record differ, the database has - # become inconsistent. For now, raise an error. This has to be fixed - # manually. + log_pending = location[1] in self.current_change_set + record_pending = location[2] in self.current_change_set if log_pending != record_pending: msg = ( f'change status mismatch: changed: ' @@ -150,16 +157,13 @@ class GitAuditBackend: raise SystemError(msg) return log_pending - def pending_changes( - self, - *paths: tuple[Path, ...], - ): - lines = self.git_lines(['status', '--porcelain=v1']) - status = {line[3:]: True for line in lines if line[0] != ' '} - return tuple((status.get(str(p), False) for p in paths)) - def persist_pending_changes(self) -> None: - self.git_lines(['commit', '-m', 'persist changes']) + apply_changeset( + self.repo, + self.current_change_set, + message='persist changes', + ) + self.current_change_set = {} def get_location_for( self, @@ -174,70 +178,19 @@ class GitAuditBackend: location_dir / (base + '.log'), ) - def init_repo(self) -> None: + def init_repo(self) -> Repo: if self.path.exists(): is_empty = len(tuple(Path(self.path).glob('**'))) == 1 else: self.path.mkdir(parents=True) is_empty = True if is_empty: - self.git_execute(['init']) + call_git(['init', '--bare', 
str(self.path)], cwd=self.path) + self.repo = Repo(self.path) - def any_change(self) -> bool: - lines = [ - line - for line in self.git_lines(['status', '--porcelain=v1']) - if not line.startswith(' D') - ] - return len(lines) > 0 - - def git_execute( - self, - command: list[str], - stdin: str | bytes | None = None, - ) -> bytes: - result = subprocess.run( - ['git', '-C', str(self.path)] + command, - capture_output=True, - check=True, - input=stdin.encode() if isinstance(stdin, str) else stdin, - ) - return result.stdout - - def git_lines( - self, - command: list[str], - stdin: str | bytes | None = None, - ) -> list[str]: - result = self.git_execute(command, stdin=stdin) - return result.decode().splitlines() - - def git_one_line( - self, - command: list[str], - stdin: str | bytes | None = None, - ) -> str: - lines = self.git_lines(command, stdin) - if len(lines) != 1: - msg = f'expected one line as result, got: {len(lines)}: {lines}' - raise ValueError(msg) - return lines[0] - - def add_file_to_index( - self, - file_path: Path, - content: str, - ): - """Add a file to the index without storing it in the worktree""" - - object_hash = self.git_one_line( - ['hash-object', '-t', 'blob', '-w', '--stdin'], - stdin=content, - ) - self.git_lines( - [ - 'update-index', - '--add', - '--cacheinfo', f'100644,{object_hash},{file_path}', - ], + apply_changeset( + self.repo, + {'README.txt': 'A git-based audit backend\n'}, + message='add README.txt', ) + return self.repo diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index 1c8e434..41821be 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -6,18 +6,6 @@ from graphql.language import location from dump_things_service.audit.gitaudit import GitAuditBackend -def _get_git_status(path: Path) -> dict: - result = subprocess.run( - ['git', '-C', str(path), 'status', '--porcelain=v1'], - capture_output=True, 
- check=True, - ) - return { - line[3:]: line[:2] - for line in result.stdout.decode().splitlines() - } - - def _get_git_log(path: Path) -> list[str]: result = subprocess.run( ['git', '-C', str(path), 'log', '--oneline'], @@ -47,17 +35,13 @@ def test_gitaudit_basic(tmp_path_factory): ) # Check that the log file has 4 entries + backend.flush() log_lines = _get_audit_log_lines(backend, record_id) assert len(log_lines) == 4 - # Check that the commit log has 3 entries + # Check that the commit log has 4 + 1 (from `README.txt`) entries commit_log_lines = _get_git_log(tmp_path) - assert len(commit_log_lines) == 3 - - # Check that the files are in modified state. - status = _get_git_status(tmp_path) - assert len(status) == 2 - assert tuple(status.values()) == ('MD', 'MD') + assert len(commit_log_lines) == 5 # Check that the changes are reported changes = backend.get_audit_log(record_id) @@ -76,27 +60,21 @@ def test_gitaudit_identical_change(tmp_path_factory): record={'pid': record_id}, user_id='tester@example.com', ) - # Check that the a new record was added - status = _get_git_status(tmp_path) - assert tuple(status.values()) == ('AD', 'AD') backend.add_record( record_id=record_id, record={'pid': record_id}, user_id='tester@example.com', ) - # Check that there is no change in the repository, which means the files - # should just be registered as deleted. - status = _get_git_status(tmp_path) - assert tuple(status.values()) == (' D', ' D') # Check that there is only one entry in the audit log log_lines = _get_audit_log_lines(backend, record_id) assert len(log_lines) == 1 - # Check that there is only one entry in the commit history + # Check that there are two entries in the commit history, one for the + # `README.txt`-file, one for the log entries. 
commit_log_lines = _get_git_log(tmp_path) - assert len(commit_log_lines) == 1 + assert len(commit_log_lines) == 2 # Check that the changes are reported changes = backend.get_audit_log(record_id) @@ -125,5 +103,3 @@ def test_gitaudit_huge_log(tmp_path_factory): record_id = f'huge_{i}' changes = backend.get_audit_log(record_id) assert len(changes) == change_number - print(f'- {record_id} --------------') - print(changes) -- 2.52.0 From f0451da3149c008f8df54907b75b07d2060b32c5 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 16 Mar 2026 11:41:55 +0100 Subject: [PATCH 11/17] add audit-backend handling This commit adds `audit_backends` to configuration specifications. It also adds audit trails for every record that is stored via the curator-interface. For each record that is stored via the curator interface, an audit entry is added to every audit-backend. The audit entry contains: 1. timestamp when the record was added to the audit trail 2. the id of the token that was used to store the record 3. the changes that were made to the record (git diff format) 4.
the content of the new record The audit trail for a given PID can be printed by using the command: dump-things-report-gitaudit --- .../commands/report_gitaudit.py | 3 +- dump_things_service/config.py | 9 +++++ dump_things_service/curated.py | 7 ++++ dump_things_service/tests/fixtures.py | 10 ++++-- dump_things_service/tests/test_curated.py | 34 +++++++++++++++++++ 5 files changed, 60 insertions(+), 3 deletions(-) diff --git a/dump_things_service/commands/report_gitaudit.py b/dump_things_service/commands/report_gitaudit.py index b242da4..5ec037a 100644 --- a/dump_things_service/commands/report_gitaudit.py +++ b/dump_things_service/commands/report_gitaudit.py @@ -3,6 +3,7 @@ from __future__ import annotations import json import sys from argparse import ArgumentParser +from pathlib import Path from dump_things_service.audit.gitaudit import GitAuditBackend @@ -26,7 +27,7 @@ parser.add_argument( def main(): arguments = parser.parse_args() - audit_backend = GitAuditBackend(arguments.audit_store) + audit_backend = GitAuditBackend(Path(arguments.audit_store)) changes = audit_backend.get_audit_log(arguments.pid) output = { diff --git a/dump_things_service/config.py b/dump_things_service/config.py index 5537bd4..b47b236 100644 --- a/dump_things_service/config.py +++ b/dump_things_service/config.py @@ -27,6 +27,7 @@ from dump_things_service import ( HTTP_404_NOT_FOUND, Format, ) +from dump_things_service.audit.gitaudit import GitAuditBackend from dump_things_service.backends.record_dir import RecordDirStore from dump_things_service.backends.schema_type_layer import SchemaTypeLayer from dump_things_service.backends.sqlite import SQLiteBackend @@ -180,6 +181,7 @@ class InstanceConfig: validators: dict = dataclasses.field(default_factory=dict) use_classes: dict = dataclasses.field(default_factory=dict) maintenance_mode: set = dataclasses.field(default_factory=set) + audit_backends: dict = dataclasses.field(default_factory=dict) mode_mapping = { TokenModes.READ_CURATED: 
TokenPermission(curated_read=True), @@ -452,6 +454,13 @@ def process_config_object( # authentication routine. instance_config.token_stores[collection_name] = {} + # Generate audit backends + instance_config.audit_backends[collection_name] = [] + for audit_backend in collection_info.audit_backends: + instance_config.audit_backends[collection_name].append( + GitAuditBackend(audit_backend.path) + ) + # Create validator for each collection for collection_name, _ in config_object.collections.items(): instance_config.validators[collection_name] = FormatConverter( diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index bb70aad..6f12927 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -374,3 +374,10 @@ async def store_curated_record( class_name, json_object, ) + + for audit_backend in instance_config.audit_backends[collection]: + audit_backend.add_record( + record_id=pid, + record=json_object, + user_id=instance_config.tokens[collection][api_key]['user_id'], + ) diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index 6edc5b0..5eb4cb7 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -42,6 +42,9 @@ collections: submission_tags: submitter_id_tag: oxo:NCIT_C54269 submission_time_tag: https://time + audit_backends: + - type: gitaudit + path: {{audit_store_path}} collection_2: default_token: basic_access curated: {curated}/collection_2 @@ -269,7 +272,10 @@ tokens: @pytest.fixture(scope='session') def dump_stores_simple(tmp_path_factory): tmp_path = tmp_path_factory.mktemp('dump_store') - (tmp_path / config_file_name).write_text(global_config_text) + audit_store_path = tmp_path_factory.mktemp('audit_store') + + final_config_text = global_config_text.format(audit_store_path=str(audit_store_path)) + (tmp_path / config_file_name).write_text(final_config_text) default_entries = { f'collection_{i}': [('Person', pid, test_record)] for i 
in range(1, 9) @@ -289,7 +295,7 @@ def dump_stores_simple(tmp_path_factory): default_entries['collection_dlflatsocial-2'] = [('Person', pid_trr, test_record_trr)] create_store( root_dir=tmp_path, - config=GlobalConfig(**yaml.safe_load(global_config_text)), + config=GlobalConfig(**yaml.safe_load(final_config_text)), per_collection_info={ 'collection_1': (str(schema_path), 'digest-md5'), 'collection_2': (str(schema_path), 'digest-md5-p3'), diff --git a/dump_things_service/tests/test_curated.py b/dump_things_service/tests/test_curated.py index b1888c1..b7be4e2 100644 --- a/dump_things_service/tests/test_curated.py +++ b/dump_things_service/tests/test_curated.py @@ -6,6 +6,7 @@ from dump_things_service import ( HTTP_200_OK, HTTP_404_NOT_FOUND, ) +from dump_things_service.config import get_config delete_record = { 'schema_type': 'abc:Person', @@ -109,3 +110,36 @@ def test_curated_delete(fastapi_client_simple): headers={'x-dumpthings-token': 'token_1_xxxxx'}, ) assert response.status_code == HTTP_404_NOT_FOUND + + +def test_audit_backend(fastapi_client_simple): + test_client, _ = fastapi_client_simple + + record_id = 'abc:audit-trailed' + response = test_client.post( + '/collection_1/curated/record/Person', + headers={'x-dumpthings-token': 'token_1_xxxxx'}, + json={ + 'schema_type': 'abc:Person', + 'pid': record_id, + 'given_name': 'Frederick', + } + ) + assert response.status_code == HTTP_200_OK + + response = test_client.post( + '/collection_1/curated/record/Person', + headers={'x-dumpthings-token': 'token_1_xxxxx'}, + json={ + 'schema_type': 'abc:Person', + 'pid': record_id, + 'given_name': 'Johny', + } + ) + assert response.status_code == HTTP_200_OK + + + config_instance = get_config() + audit_backend = config_instance.audit_backends['collection_1'][0] + changes = audit_backend.get_audit_log(record_id) + assert len(changes) == 2 -- 2.52.0 From 581948edbf9f3976b0b71eaac2739baaaf9137ba Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 16 Mar 2026 14:31:23 
+0100 Subject: [PATCH 12/17] add reference to branch `minilad` of datalad-core --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d99e64a..9104d5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,10 @@ requires = ["hatchling"] build-backend = "hatchling.build" +[tool.hatch.metadata] +# This is required for the git+https dependency +allow-direct-references = true + [project] name = "dump-things-service" dynamic = ["version"] @@ -27,6 +31,7 @@ classifiers = [ dependencies = [ "aiohttp", "click", + "datalad-core @ git+https://hub.datalad.org/datalad/datalad-core@minilad", "fastapi[standard]", "fastapi-pagination", "fsspec", -- 2.52.0 From 89c7252d63b55d24a5cc00062a8b8471f208d580 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 16 Mar 2026 14:33:37 +0100 Subject: [PATCH 13/17] remove unused import from `test_gitaudit.py` --- dump_things_service/audit/tests/test_gitaudit.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index 41821be..1c242b6 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -1,8 +1,6 @@ import subprocess from pathlib import Path -from graphql.language import location - from dump_things_service.audit.gitaudit import GitAuditBackend -- 2.52.0 From e5f648a86f4e0ecccbc870d5f22495b395defc3d Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 16 Mar 2026 14:53:23 +0100 Subject: [PATCH 14/17] set git-identity before running tests --- .forgejo/workflows/run_tests.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.forgejo/workflows/run_tests.yaml b/.forgejo/workflows/run_tests.yaml index 1716e7e..6630ca2 100644 --- a/.forgejo/workflows/run_tests.yaml +++ b/.forgejo/workflows/run_tests.yaml @@ -4,6 +4,11 @@ jobs: Test-all: runs-on: ubuntu-latest steps: + - name: Set up environment + run: | + git
config --global user.email "test@example.org" + git config --global user.name "CI Tester" + - name: Check out repository code uses: actions/checkout@v4 -- 2.52.0 From ee6c3a2f15ca6a294b90533b7056e5e5a1c0afd5 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 16 Mar 2026 15:30:17 +0100 Subject: [PATCH 15/17] add audit-backend documentation This commit adds a description of the audit-backend of type `gitaudit`. That means, it describes its configuration and features in the changelog and in `README.md`. It also adds a short description of the associated reporting tool, i.e., of `dump-things-report-gitaudit`. --- CHANGELOG.md | 2 +- README.md | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d3b7ef9..71547df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## New features - Support for audit backends was added to `dump-things-service`. Currently there - is one audit backend: `gitaudit`. The audit backend stores provenance information + is one audit backend type: `gitaudit`. The audit backend stores provenance information about records, i.e. who changed what at which time. - The new tool `dump-things-report-gitaudit` reports audit information for diff --git a/README.md b/README.md index 530b3dd..2debf48 100644 --- a/README.md +++ b/README.md @@ -502,6 +502,38 @@ collections: ``` +#### Audit Backends + +The service supports audit-logs of changes that are made via the curation interface. +Audit logs are configured per collection via the key `audit_backends`. +The key expects a list of audit-backend configurations. +Currently the only supported audit-backend type is `gitaudit`: + +```yaml +type: collections +version: 1 +collections: + collection_1: + + ... + + audit_backends: + - type: gitaudit + path: PATH + + ... +``` +Here `PATH` must be a path to a directory. +If the directory does not exist, it will be created.
+If the directory exists, it should contain a bare git repository. + +The command `dump-things-report-gitaudit` can be used to show the audit-log for a given `PID`. +Each log entry contains the timestamp of the change, the ID of the curator that posted the change, a diff of the change, and the resulting record. + +Note: currently the user ID of the curator will be stored as author in the audit-log entries. +The "original" author of a change is usually identified in the `annotations`-field of the record. + + ### Endpoints Most endpoints require a *collection*. These correspond to the names of the "data record collection"-directories (for example `myschema-v3-fmta` in [Dump Things Service](https://concepts.datalad.org/dump-things-storage-v0/)) in the stores. -- 2.52.0 From 2fcf4752350f4922b2c7c8a97030eba21837451f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 17 Mar 2026 09:17:26 +0100 Subject: [PATCH 16/17] improve gitaudit tests --- dump_things_service/tests/test_curated.py | 37 ++++++++++++----------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/dump_things_service/tests/test_curated.py b/dump_things_service/tests/test_curated.py index b7be4e2..3af68d7 100644 --- a/dump_things_service/tests/test_curated.py +++ b/dump_things_service/tests/test_curated.py @@ -1,6 +1,7 @@ from __future__ import annotations import pytest +import yaml from dump_things_service import ( HTTP_200_OK, @@ -116,30 +117,32 @@ def test_audit_backend(fastapi_client_simple): test_client, _ = fastapi_client_simple record_id = 'abc:audit-trailed' - response = test_client.post( - '/collection_1/curated/record/Person', - headers={'x-dumpthings-token': 'token_1_xxxxx'}, - json={ + names = 'Frederick', 'Johny' + tokens = 'token_1_xxxxx', 'token_admin' + user_names = 'test_user_1_curated', 'test_admin' + json_objects = tuple( + { 'schema_type': 'abc:Person', 'pid': record_id, - 'given_name': 'Frederick', + 'given_name': name, } + for name in names ) - assert
response.status_code == HTTP_200_OK - - response = test_client.post( - '/collection_1/curated/record/Person', - headers={'x-dumpthings-token': 'token_1_xxxxx'}, - json={ - 'schema_type': 'abc:Person', - 'pid': record_id, - 'given_name': 'Johny', - } - ) - assert response.status_code == HTTP_200_OK + for i in range(2): + response = test_client.post( + '/collection_1/curated/record/Person', + headers={'x-dumpthings-token': tokens[i]}, + json=json_objects[i], + ) + assert response.status_code == HTTP_200_OK config_instance = get_config() audit_backend = config_instance.audit_backends['collection_1'][0] + changes = audit_backend.get_audit_log(record_id) assert len(changes) == 2 + values = tuple(changes.values()) + for i in range(2): + assert values[i][0] == user_names[i] + assert yaml.safe_load(values[i][2]) == json_objects[i] -- 2.52.0 From 300a32665e737124c0bec03c5f7aad70350bbff2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 17 Mar 2026 09:51:17 +0100 Subject: [PATCH 17/17] refactor GitAuditBackend Introduce an abstract `AuditBackend` base class and derive `GitAuditBackend` from the base class. --- dump_things_service/audit/__init__.py | 60 +++++++++++++++++++ dump_things_service/audit/gitaudit.py | 45 +++++++------- .../audit/tests/test_gitaudit.py | 4 +- 3 files changed, 86 insertions(+), 23 deletions(-) diff --git a/dump_things_service/audit/__init__.py b/dump_things_service/audit/__init__.py index e69de29..3bfef2b 100644 --- a/dump_things_service/audit/__init__.py +++ b/dump_things_service/audit/__init__.py @@ -0,0 +1,60 @@ +from abc import ( + ABCMeta, + abstractmethod, +) + + +class AuditBackend(metaclass=ABCMeta): + @abstractmethod + def add_record( + self, + record_id: str, + record: dict, + user_id: str, + ) -> None: + """Add information about a new record version to the audit log + + :param record_id: the ID of the record (this is usually `record['pid']`. + :param record: the content of the new record (will be stored in YAML format). 
+ :param user_id: the ID of the user who adds the record. + :return: A dictionary where the keys are time stamps of the changes, + the values are tuples containing the elements: + (user_id, diff, resulting_record), where user_id is the + `user_id` that was used in `add_record`, `resulting_record` is + the YAML-representation of `record` that was given to + `add_record`, and diff is the patch that transforms the previous + version of the record to the version provided in `record` (in + git-diff format). + """ + raise NotImplementedError + + @abstractmethod + def flush(self): + """Ensure that all audit-log entries are persisted on disk + + After `flush()` returns, external tools should be able to pick up all + log-entries from the persisted data. + """ + raise NotImplementedError + + @abstractmethod + def get_audit_log( + self, + record_id: str, + ) -> dict: + """Get the content of the audit log + + All diffs and content are communicated in YAML format. + + :param record_id: the ID of the record (as given in the parameter + `record_id` in the call to `add_record`). + :return: A dictionary where the keys are time stamps of the changes, + the values are tuples containing the elements: + (user_id, diff, resulting_record), where user_id is the + `user_id` that was used in `add_record`, `resulting_record` is + the YAML-representation of `record` that was given to + `add_record`, and diff is the patch that transforms the previous + version of the record to the version provided in `record` (in + git-diff format). + """ + raise NotImplementedError diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index 091c646..8d4ce51 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -18,7 +18,10 @@ from datalad_core.runners import ( CommandError, ) -class GitAuditBackend: +from .
import AuditBackend + + +class GitAuditBackend(AuditBackend): def __init__( self, @@ -27,22 +30,22 @@ class GitAuditBackend: self.path = path self.cache = {} self.current_change_set = {} - self.repo = self.init_repo() + self.repo = self._init_repo() def add_record( self, record_id: str, record: dict, user_id: str, - ): - location = self.get_location_for(record_id) - if self.has_pending_changes(location): - self.persist_pending_changes() - self.add_elements(location, user_id, record) + ) -> None: + location = self._get_location_for(record_id) + if self._has_pending_changes(location): + self._persist_pending_changes() + self._add_elements(location, user_id, record) def flush(self): if self.current_change_set: - self.persist_pending_changes() + self._persist_pending_changes() def get_audit_log( self, @@ -53,7 +56,7 @@ class GitAuditBackend: # Get all commits that updated the log. Those will also have updated # the records changes = [] - yaml_location, log_location = map(str, self.get_location_for(record_id)[1:]) + yaml_location, log_location = map(str, self._get_location_for(record_id)[1:]) commit_hashes = call_git( ['log', '--format=%H', '--', log_location], cwd=self.path, @@ -93,13 +96,13 @@ class GitAuditBackend: changes.sort() return {c[0]: c[1:] for c in changes} - def add_elements( + def _add_elements( self, location: tuple[str, Path, Path], user_id: str, record: dict, ) -> bool: - existing_record = self.read_record_from_repo_path(location[1]) + existing_record = self._read_record_from_repo_path(location[1]) if existing_record != record: self.current_change_set[location[1]] = yaml.dump( data=record, @@ -107,21 +110,21 @@ class GitAuditBackend: allow_unicode=True, default_flow_style=False, ) - self.add_log_entry(location[2], user_id) + self._add_log_entry(location[2], user_id) return True return False - def add_log_entry( + def _add_log_entry( self, log_location: Path, user_id: str, ) -> None: time_stamp = datetime.now().isoformat() - log_content = 
self.read_from_repo_path(log_location).decode() + log_content = self._read_from_repo_path(log_location).decode() log_content += f'{time_stamp} {user_id}\n' self.current_change_set[log_location] = log_content - def read_from_repo_path( + def _read_from_repo_path( self, path: Path, ) -> bytes: @@ -136,13 +139,13 @@ class GitAuditBackend: return b'' raise - def read_record_from_repo_path( + def _read_record_from_repo_path( self, path: Path, ): - return yaml.safe_load(self.read_from_repo_path(path)) + return yaml.safe_load(self._read_from_repo_path(path)) - def has_pending_changes( + def _has_pending_changes( self, location: tuple[str, Path, Path], ) -> bool: @@ -157,7 +160,7 @@ class GitAuditBackend: raise SystemError(msg) return log_pending - def persist_pending_changes(self) -> None: + def _persist_pending_changes(self) -> None: apply_changeset( self.repo, self.current_change_set, @@ -165,7 +168,7 @@ class GitAuditBackend: ) self.current_change_set = {} - def get_location_for( + def _get_location_for( self, record_id: str, ) -> tuple[str, Path, Path]: @@ -178,7 +181,7 @@ class GitAuditBackend: location_dir / (base + '.log'), ) - def init_repo(self) -> Repo: + def _init_repo(self) -> Repo: if self.path.exists(): is_empty = len(tuple(Path(self.path).glob('**'))) == 1 else: diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index 1c242b6..1620355 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -14,8 +14,8 @@ def _get_git_log(path: Path) -> list[str]: def _get_audit_log_lines(backend: GitAuditBackend, record_id: str) -> list[str]: - locations = backend.get_location_for(record_id) - return backend.read_from_repo_path(locations[2]).decode().splitlines() + locations = backend._get_location_for(record_id) + return backend._read_from_repo_path(locations[2]).decode().splitlines() def test_gitaudit_basic(tmp_path_factory): -- 2.52.0