From a46d24614f8282036ae3741fd1c1e90ab97d412b Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Mar 2026 08:18:15 +0100 Subject: [PATCH 1/3] use JSONL format for gitaudit log-entries --- dump_things_service/audit/gitaudit.py | 39 +++++++++++---------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index 9818874..555bbb6 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -7,6 +7,7 @@ committed. Changes are annotated with a time stamp and a user-id """ import hashlib +import json import re import string from datetime import datetime @@ -42,8 +43,6 @@ class GitAuditBackend(AuditBackend): author_id: str | None = None, ) -> None: author_id = committer_id if author_id is None else author_id - committer_id = self._escape_person_id(committer_id) - author_id = self._escape_person_id(author_id) record_id = record['pid'] location = self._get_location_for(record_id) if self._has_pending_changes(location): @@ -80,13 +79,13 @@ class GitAuditBackend(AuditBackend): capture_output=True, ).decode().splitlines() # Get the log entry - log_entry = tuple( + log_line = tuple( filter( lambda l: not l.startswith('+++') and l.startswith('+'), log_diff_lines, ) )[0][1:] - time_stamp, committer_id, author_id = log_entry.split(' ') + log_entry = json.loads(log_line) # Get the YAML diff yaml_diff_lines = call_git( @@ -103,7 +102,13 @@ class GitAuditBackend(AuditBackend): capture_output=True, ).decode() changes.append( - (time_stamp, committer_id, author_id, yaml_diff, yaml_content) + ( + log_entry['time_stamp'], + log_entry['committer_id'], + log_entry['author_id'], + yaml_diff, + yaml_content, + ) ) changes.sort() @@ -154,8 +159,13 @@ class GitAuditBackend(AuditBackend): author_id: str, ) -> None: time_stamp = datetime.now().isoformat() + entry = { + 'time_stamp': time_stamp, + 'committer_id': committer_id, + 'author_id': author_id, + } log_content = self._read_from_repo_path(log_location).decode() - log_content += f'{time_stamp} {committer_id} {author_id}\n' + log_content += json.dumps(entry, ensure_ascii=False) + '\n' self.current_change_set[log_location] = log_content def _add_index_entry( @@ -280,20 +290,3 @@ class GitAuditBackend(AuditBackend): ) print('got record:', repr(record)) f.write(record['pid'] + '\n') - - def _escape_person_id( - self, - person_id: str, - ): - if not person_id: - msg = f'empty ID string not allowed: {person_id}' - raise ValueError(msg) - if any( - map( - lambda character: character in person_id, - string.whitespace - ) - ): - msg = f'ID string must not contain whitespace: {person_id}' - raise ValueError(msg) - return person_id -- 2.52.0 From c5ad5a760e25b6ab01ed048e01b79f0e697f73df Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Mar 2026 08:45:18 +0100 Subject: [PATCH 2/3] improve git-ls-tree interpretation, remove prints --- dump_things_service/audit/gitaudit.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index 555bbb6..2ffbc5f 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -9,7 +9,6 @@ Changes are annotated with a time stamp and a user-id import hashlib import json import re -import string from datetime import datetime from pathlib import Path @@ -268,7 +267,6 @@ class GitAuditBackend(AuditBackend): self.index.add(record_id) def _rebuild_index(self): - print('rebuilding index') tree_entries = call_git( ['ls-tree', '-r', 'master:'], cwd=self.path, @@ -277,10 +275,8 @@ class GitAuditBackend(AuditBackend): with open(self.index_path, 'wt') as f: for line in tree_entries: if not line.endswith('.yaml'): - print('ignoring line:', repr(line)) continue - print('got line:', repr(line)) - flag, object_type, object_hash, file_name = line.split() + flag, object_type, object_hash, file_name = line.split(maxsplit=3) record = yaml.safe_load( call_git( ['show', object_hash], @@ -288,5 +284,4 @@ class GitAuditBackend(AuditBackend): capture_output=True, ).decode() ) - print('got record:', repr(record)) f.write(record['pid'] + '\n') -- 2.52.0 From 5b972df7034c6cdc3cb2e918e4536e223578cb73 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 20 Mar 2026 09:33:26 +0100 Subject: [PATCH 3/3] change gitaudit index filename --- dump_things_service/audit/gitaudit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index 2ffbc5f..94e2ea5 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -23,6 +23,9 @@ from datalad_core.runners import ( from . import AuditBackend +index_file_name = 'gitaudit_index.log' + + class GitAuditBackend(AuditBackend): def __init__( @@ -238,7 +241,7 @@ class GitAuditBackend(AuditBackend): else: self.path.mkdir(parents=True) is_empty = True - self.index_path = self.path / 'index.log' + self.index_path = self.path / index_file_name if is_empty: call_git(['init', '--bare', str(self.path)], capture_output=True) -- 2.52.0