Add PID matching based on regex-patterns for gitaudit-backend reports. #199

Merged
cmo merged 9 commits from audit_log_id_regex into master 2026-03-19 07:32:01 +00:00
10 changed files with 244 additions and 90 deletions

View file

@ -527,9 +527,13 @@ Here `<path to directory>` must be a path to a directory.
If the directory does not exist, it will be created. If the directory does not exist, it will be created.
If the directory exists, it should contain a bare git repository. If the directory exists, it should contain a bare git repository.
The command `dump-things-report-gitaudit <path to directory> <PID>` can be used to show the audit-log for the given `PID`. The command `dump-things-gitaudit-report <path to directory> <PID-pattern>` can be used to show the audit-log for all PIDs that match the given `PID`-pattern (patterns are in Python `re`-module syntax, e.g. use `'.*'` to report changes for all PIDs).
Each log entry contains the timestamp of the change, the ID of the curator that posted the change, a diff of the change, and the resulting record. Each log entry contains the timestamp of the change, the ID of the curator that posted the change, a diff of the change, and the resulting record.
The command `dump-things-gitaudit-rebuild-index <path to directory>` can be used to rebuild an index for a git-audit backend.
Executing this command should not be necessary in normal operations because the backend will rebuild an index if it is instantiated on a directory that has no index.
The command mainly exists for maintenance purposes.
Note: currently the user ID of the curator will be stored as author in the audit-log entries. Note: currently the user ID of the curator will be stored as author in the audit-log entries.
The "original" author of a change is usually identified in the `annotations`-field of the record. The "original" author of a change is usually identified in the `annotations`-field of the record.
@ -673,7 +677,7 @@ The service provides the following user endpoints (In addition to user endpoints
The service supports a set of curation endpoints that allows direct access to the curated area as well as the incoming areas. The service supports a set of curation endpoints that allows direct access to the curated area as well as the incoming areas.
A `CURATOR`-token is required to access these endpoints. A `CURATOR`-token is required to access these endpoints.
Details about the curation endpoints can be found in [this issue](https://github.com/christian-monch/dump-things-server/issues/118). Details about the curation endpoints can be found in [this issue](https://codeberg.org/datalink/dump-things-server/issues/118).
### Tips & Tricks ### Tips & Tricks

View file

@ -8,15 +8,18 @@ class AuditBackend(metaclass=ABCMeta):
@abstractmethod @abstractmethod
def add_record( def add_record(
self, self,
record_id: str,
record: dict, record: dict,
user_id: str, committer_id: str,
author_id: str | None = None,
) -> None: ) -> None:
"""Add information about a new record version to the audit log """Add information about a new record version to the audit log
:param record_id: the ID of the record (this is usually `record['pid']`. :param record: the content of the new record. The record must contain
:param record: the content of the new record (will be stored in YAML format). a `pid`-key which is associated with the ID of the record (the
:param user_id: the ID of the user who adds the record. record will be stored in YAML format).
:param committer_id: the ID of the user who adds the record.
:param author_id: the ID of the user who modified the record, defaults
to `committer_id` if not given.
:return: A dictionary where the keys are time stamps of the changes, :return: A dictionary where the keys are time stamps of the changes,
the values are tuples containing the elements: the values are tuples containing the elements:
(user_id, diff, resulting_record), where user_id is the (user_id, diff, resulting_record), where user_id is the

View file

@ -7,6 +7,8 @@ committed.
Changes are annotated with a time stamp and a user-id Changes are annotated with a time stamp and a user-id
""" """
import hashlib import hashlib
import re
import string
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@ -28,24 +30,33 @@ class GitAuditBackend(AuditBackend):
path: Path, path: Path,
): ):
self.path = path self.path = path
self.cache = {} self.index_path = None
self.cached_index_entries = []
self.current_change_set = {} self.current_change_set = {}
self.repo = self._init_repo() self._init_repo()
def add_record( def add_record(
self, self,
record_id: str,
record: dict, record: dict,
user_id: str, committer_id: str,
author_id: str = '',
) -> None: ) -> None:
author_id = committer_id if author_id == '' else author_id
committer_id = self._escape_person_id(committer_id)
author_id = self._escape_person_id(author_id)
record_id = record['pid']
location = self._get_location_for(record_id) location = self._get_location_for(record_id)
if self._has_pending_changes(location): if self._has_pending_changes(location):
self._persist_pending_changes() self._persist_pending_changes()
self._add_elements(location, user_id, record) self._add_elements(record_id, location, committer_id, author_id, record)
def flush(self): def flush(self):
if self.current_change_set: if self.current_change_set:
self._persist_pending_changes() self._persist_pending_changes()
if self.cached_index_entries:
with self.index_path.open('at') as f:
f.write('\n'.join(self.cached_index_entries) + '\n')
self.cached_index_entries = []
def get_audit_log( def get_audit_log(
self, self,
@ -75,7 +86,7 @@ class GitAuditBackend(AuditBackend):
log_diff_lines, log_diff_lines,
) )
)[0][1:] )[0][1:]
time_stamp, user_id = log_entry.split(' ', 1) time_stamp, committer_id, author_id = log_entry.split(' ')
# Get the YAML diff # Get the YAML diff
yaml_diff_lines = call_git( yaml_diff_lines = call_git(
@ -91,15 +102,36 @@ class GitAuditBackend(AuditBackend):
cwd=self.path, cwd=self.path,
capture_output=True, capture_output=True,
).decode() ).decode()
changes.append((time_stamp, user_id, yaml_diff, yaml_content)) changes.append(
(time_stamp, committer_id, author_id, yaml_diff, yaml_content)
)
changes.sort() changes.sort()
return {c[0]: c[1:] for c in changes} return {c[0]: c[1:] for c in changes}
def get_audit_logs(
    self,
    record_id_pattern: str,
) -> dict:
    """Return the audit logs of every record whose ID matches a pattern.

    Pending changes and cached index entries are flushed first, so the
    in-memory index reflects all records added so far. A record ID is
    selected only if `record_id_pattern` matches it completely
    (`re.fullmatch`, Python `re`-syntax).

    :param record_id_pattern: regular expression matched against record IDs.
    :return: a dict mapping each matching record ID (in sorted order) to
        the result of `get_audit_log` for that ID.
    """
    self.flush()
    pattern = re.compile(record_id_pattern)
    selected_ids = sorted(
        record_id
        for record_id in self.index
        if pattern.fullmatch(record_id)
    )
    return {
        record_id: self.get_audit_log(record_id)
        for record_id in selected_ids
    }
def _add_elements( def _add_elements(
self, self,
record_id: str,
location: tuple[str, Path, Path], location: tuple[str, Path, Path],
user_id: str, committer_id: str,
author_id: str,
record: dict, record: dict,
) -> bool: ) -> bool:
existing_record = self._read_record_from_repo_path(location[1]) existing_record = self._read_record_from_repo_path(location[1])
@ -110,20 +142,30 @@ class GitAuditBackend(AuditBackend):
allow_unicode=True, allow_unicode=True,
default_flow_style=False, default_flow_style=False,
) )
self._add_log_entry(location[2], user_id) self._add_log_entry(location[2], committer_id, author_id)
self._add_index_entry(record_id)
return True return True
return False return False
def _add_log_entry( def _add_log_entry(
self, self,
log_location: Path, log_location: Path,
user_id: str, committer_id: str,
author_id: str,
) -> None: ) -> None:
time_stamp = datetime.now().isoformat() time_stamp = datetime.now().isoformat()
log_content = self._read_from_repo_path(log_location).decode() log_content = self._read_from_repo_path(log_location).decode()
log_content += f'{time_stamp} {user_id}\n' log_content += f'{time_stamp} {committer_id} {author_id}\n'
self.current_change_set[log_location] = log_content self.current_change_set[log_location] = log_content
def _add_index_entry(
    self,
    record_id: str,
):
    """Register *record_id* in the in-memory index.

    A newly seen ID is also queued in `cached_index_entries`, so the next
    `flush()` appends it to the on-disk index file. Already known IDs are
    ignored.
    """
    if record_id in self.index:
        return
    self.index.add(record_id)
    self.cached_index_entries.append(record_id)
def _read_from_repo_path( def _read_from_repo_path(
self, self,
path: Path, path: Path,
@ -181,19 +223,77 @@ class GitAuditBackend(AuditBackend):
location_dir / (base + '.log'), location_dir / (base + '.log'),
) )
def _init_repo(self) -> Repo: def _init_repo(self) -> None:
if self.path.exists(): if self.path.exists():
is_empty = len(tuple(Path(self.path).glob('**'))) == 1 is_empty = len(tuple(Path(self.path).glob('**'))) == 1
else: else:
self.path.mkdir(parents=True) self.path.mkdir(parents=True)
is_empty = True is_empty = True
if is_empty: self.index_path = self.path / 'index.log'
call_git(['init', '--bare', str(self.path)], cwd=self.path)
self.repo = Repo(self.path)
apply_changeset( if is_empty:
self.repo, call_git(['init', '--bare', str(self.path)], capture_output=True)
{'README.txt': 'A git-based audit backend\n'}, self.repo = Repo(self.path)
message='add README.txt', apply_changeset(
) self.repo,
return self.repo {'README.txt': 'A git-based audit backend\n'},
message='add README.txt',
)
self.index_path.write_text('')
else:
self.repo = Repo(self.path)
if not self.index_path.exists():
self._rebuild_index()
with open(self.index_path, 'rt') as f:
self.index = set(line.strip() for line in f.readlines())
def _add_to_index(
    self,
    record_id: str,
):
    # NOTE(review): this method is byte-for-byte identical to
    # `_add_index_entry` (defined earlier in this class). Only one of the
    # two is needed — consolidate on a single name and delete the other.
    #
    # Registers *record_id* in the in-memory index and queues newly seen
    # IDs in `cached_index_entries` so the next `flush()` persists them
    # to the on-disk index file.
    if record_id not in self.index:
        self.cached_index_entries.append(record_id)
        self.index.add(record_id)
def _rebuild_index(self):
    """Recreate the record-ID index file from the git repository content.

    Lists every blob reachable from `master`, loads each `*.yaml` record,
    and writes the record's `pid` to `self.index_path` — one ID per line,
    overwriting any existing index file.

    Fixes over the previous version: the leftover debug `print()` calls
    are removed, and the `git ls-tree` output is split on the tab that
    separates metadata from the path, so file names containing blanks no
    longer break the unpacking.
    """
    tree_entries = call_git(
        ['ls-tree', '-r', 'master:'],
        cwd=self.path,
        capture_output=True,
    ).decode().splitlines()
    with open(self.index_path, 'wt') as f:
        for line in tree_entries:
            # `git ls-tree` line format: "<mode> <type> <hash>\t<path>".
            metadata, _, file_name = line.partition('\t')
            if not file_name.endswith('.yaml'):
                # Skip non-record entries, e.g. README.txt or log files.
                continue
            _, _, object_hash = metadata.split()
            record = yaml.safe_load(
                call_git(
                    ['show', object_hash],
                    cwd=self.path,
                    capture_output=True,
                ).decode()
            )
            f.write(record['pid'] + '\n')
def _escape_person_id(
self,
person_id: str,
):
if not person_id:
msg = f'empty ID string not allowed: {person_id}'
raise ValueError(msg)
if any(
map(
lambda character: character in person_id,
string.whitespace
)
):
msg = f'ID string must not contain whitespace: {person_id}'
raise ValueError(msg)
return person_id

View file

@ -27,9 +27,9 @@ def test_gitaudit_basic(tmp_path_factory):
for index in range(4): for index in range(4):
backend.add_record( backend.add_record(
record_id=record_id,
record={'pid': record_id, 'content': index}, record={'pid': record_id, 'content': index},
user_id=f'tester_{index}@example.com', committer_id=f'committer_{100 + index}@x.org',
author_id=f'author_{index}@y.org',
) )
# Check that the log file has 4 entries # Check that the log file has 4 entries
@ -44,7 +44,10 @@ def test_gitaudit_basic(tmp_path_factory):
# Check that the changes are reported # Check that the changes are reported
changes = backend.get_audit_log(record_id) changes = backend.get_audit_log(record_id)
assert len(changes) == 4 assert len(changes) == 4
assert tuple(map(lambda e: e[0], changes.values())) == tuple((f'tester_{i}@example.com' for i in range(4))) assert tuple(map(lambda e: e[0:2], changes.values())) == tuple(
(f'committer_{100 + i}@x.org', f'author_{i}@y.org')
for i in range(4)
)
def test_gitaudit_identical_change(tmp_path_factory): def test_gitaudit_identical_change(tmp_path_factory):
@ -54,15 +57,15 @@ def test_gitaudit_identical_change(tmp_path_factory):
record_id = 'test_gitaudit_idempotent' record_id = 'test_gitaudit_idempotent'
backend.add_record( backend.add_record(
record_id=record_id,
record={'pid': record_id}, record={'pid': record_id},
user_id='tester@example.com', committer_id='committer_b@x.org',
author_id = 'author_b@y.org',
) )
backend.add_record( backend.add_record(
record_id=record_id,
record={'pid': record_id}, record={'pid': record_id},
user_id='tester@example.com', committer_id='committer_b@x.org',
author_id = 'author_b@y.org',
) )
# Check that there is only one entry in the audit log # Check that there is only one entry in the audit log
@ -91,9 +94,9 @@ def test_gitaudit_huge_log(tmp_path_factory):
for i in range(record_number): for i in range(record_number):
record_id = f'huge_{i}' record_id = f'huge_{i}'
backend.add_record( backend.add_record(
record_id=record_id,
record={'pid': record_id, 'content': f'j:{j}, i:{i}'}, record={'pid': record_id, 'content': f'j:{j}, i:{i}'},
user_id='tester@example.com', committer_id='committer@x.org',
author_id = 'author@y.org',
) )
# Check that the changes are reported # Check that the changes are reported

View file

@ -0,0 +1,29 @@
from __future__ import annotations
import sys
from argparse import ArgumentParser
from pathlib import Path
from dump_things_service.audit.gitaudit import GitAuditBackend
parser = ArgumentParser(
prog='Rebuild the index of a `gitaudit`-database',
description='This command rebuilds the index of a `gitaudit`-database.'
)
parser.add_argument(
'audit_store',
help='The directory in which the `gitaudit`-database is located.'
)
def main():
    """Entry point for the `dump-things-gitaudit-rebuild-index` command.

    Parses the command line, opens the `gitaudit`-database in the given
    directory, and forces a rebuild of its record-ID index.

    :return: 0, used as the process exit code.
    """
    arguments = parser.parse_args()
    audit_backend = GitAuditBackend(Path(arguments.audit_store))
    # NOTE(review): reaches into a private method of the backend; consider
    # exposing a public `rebuild_index()` on `GitAuditBackend` instead.
    audit_backend._rebuild_index()
    return 0
if __name__ == '__main__':
sys.exit(main())

View file

@ -0,0 +1,56 @@
from __future__ import annotations
import json
import re
import sys
from argparse import ArgumentParser
from pathlib import Path
from dump_things_service.audit.gitaudit import GitAuditBackend
parser = ArgumentParser(
prog='Report audit information for a PID',
description='Report the audit information that was stored for a specific '
'PID. For every change to a record the tool will report: '
'time stamp, user ID, diff, and the resulting record.',
)
parser.add_argument(
'audit_store',
help='The path to the gitaudit store',
)
parser.add_argument(
'pid',
help='Regex pattern that identifies PIDs of the record for which audit '
'information should be reported '
'(to see all audit log entries, specify ".*").',
)
def main():
    """Entry point for the `dump-things-gitaudit-report` command.

    Validates the PID regex given on the command line, fetches the audit
    logs of all matching record IDs, and prints one JSON object per change
    (JSON-lines) to stdout.

    :return: process exit code — 0 on success, 1 for an invalid pattern.
    """
    arguments = parser.parse_args()
    # Fail early with a readable message if the user-supplied pattern is
    # not a valid regular expression.
    try:
        re.compile(arguments.pid)
    except re.error as e:
        print('Error in PID pattern:', e, file=sys.stderr, flush=True)
        return 1
    audit_backend = GitAuditBackend(Path(arguments.audit_store))
    all_changes = audit_backend.get_audit_logs(arguments.pid)
    for record_id, report_changes in all_changes.items():
        # `report_changes` maps time stamps to
        # (committer_id, author_id, diff, resulting_record)-tuples,
        # matching the order produced by `GitAuditBackend.get_audit_log`.
        for time_stamp, change in report_changes.items():
            report = {
                'time-stamp': time_stamp,
                'record-id': record_id,
                'committer-id': change[0],
                'author-id': change[1],
                'diff': change[2],
                'resulting-record': change[3],
            }
            print(json.dumps(report, ensure_ascii=False), flush=True)
    return 0
if __name__ == '__main__':
sys.exit(main())

View file

@ -1,47 +0,0 @@
from __future__ import annotations
import json
import sys
from argparse import ArgumentParser
from pathlib import Path
from dump_things_service.audit.gitaudit import GitAuditBackend
parser = ArgumentParser(
prog='Report audit information for a PID',
description='Report the audit information that was stored for a specific '
'PID. For every change to a record the tool will report: '
'time stamp, user ID, diff, and the resulting record.',
)
parser.add_argument(
'audit_store',
help='The path to the gitaudit store',
)
parser.add_argument(
'pid',
help='The PID of the record for which audit information should be reported.',
)
def main():
arguments = parser.parse_args()
audit_backend = GitAuditBackend(Path(arguments.audit_store))
changes = audit_backend.get_audit_log(arguments.pid)
output = {
time_stamp: {
'user-id': change[0],
'diff': change[1],
'resulting-record': change[2],
}
for time_stamp, change in changes.items()
}
print(json.dumps(output, indent=2, ensure_ascii=False))
return 0
if __name__ == '__main__':
sys.exit(main())

View file

@ -44,17 +44,20 @@ if TYPE_CHECKING:
_endpoint_curated_template = """ _endpoint_curated_template = """
async def {name}( async def {name}(
data: {model_var_name}.{class_name}, data: {model_var_name}.{class_name},
author_id: str | None = None,
api_key: str = Depends(api_key_header_scheme), api_key: str = Depends(api_key_header_scheme),
) -> JSONResponse: ) -> JSONResponse:
logger.info( logger.info(
'{name}(%s, %s)', '{name}(%s, %s, %s)',
repr(data), repr(data),
repr(author_id),
repr({model_var_name}), repr({model_var_name}),
) )
return await store_curated_record( return await store_curated_record(
'{collection}', '{collection}',
data, data,
'{class_name}', '{class_name}',
author_id,
api_key, api_key,
) )
""" """
@ -353,6 +356,7 @@ async def store_curated_record(
collection: str, collection: str,
data: BaseModel, data: BaseModel,
class_name: str, class_name: str,
author_id: str | None = None,
api_key: str | None = Depends(api_key_header_scheme), api_key: str | None = Depends(api_key_header_scheme),
): ):
@ -377,7 +381,7 @@ async def store_curated_record(
for audit_backend in instance_config.audit_backends[collection]: for audit_backend in instance_config.audit_backends[collection]:
audit_backend.add_record( audit_backend.add_record(
record_id=pid,
record=json_object, record=json_object,
user_id=instance_config.tokens[collection][api_key]['user_id'], committer_id=instance_config.tokens[collection][api_key]['user_id'],
author_id=author_id,
) )

View file

@ -131,7 +131,7 @@ def test_audit_backend(fastapi_client_simple):
for i in range(2): for i in range(2):
response = test_client.post( response = test_client.post(
'/collection_1/curated/record/Person', f'/collection_1/curated/record/Person?author_id=author_{i}@www.org',
headers={'x-dumpthings-token': tokens[i]}, headers={'x-dumpthings-token': tokens[i]},
json=json_objects[i], json=json_objects[i],
) )
@ -145,4 +145,5 @@ def test_audit_backend(fastapi_client_simple):
values = tuple(changes.values()) values = tuple(changes.values())
for i in range(2): for i in range(2):
assert values[i][0] == user_names[i] assert values[i][0] == user_names[i]
assert yaml.safe_load(values[i][2]) == json_objects[i] assert values[i][1] == f'author_{i}@www.org'
assert yaml.safe_load(values[i][3]) == json_objects[i]

View file

@ -55,7 +55,8 @@ dump-things-rebuild-index = "dump_things_service.commands.rebuild_index:main"
dump-things-copy-store = "dump_things_service.commands.copy_store:main" dump-things-copy-store = "dump_things_service.commands.copy_store:main"
dump-things-pid-check = "dump_things_service.commands.check_pids:main" dump-things-pid-check = "dump_things_service.commands.check_pids:main"
dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main" dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main"
dump-things-report-gitaudit = "dump_things_service.commands.report_gitaudit:main" dump-things-gitaudit-report = "dump_things_service.commands.gitaudit_report:main"
dump-things-gitaudit-rebuild-index = "dump_things_service.commands.gitaudit_rebuild_index:main"
[tool.hatch.build.targets.wheel] [tool.hatch.build.targets.wheel]
exclude = [ exclude = [
@ -118,7 +119,7 @@ extra-dependencies = [
] ]
[tool.hatch.envs.tests.scripts] [tool.hatch.envs.tests.scripts]
run = 'python -m pytest {args:dump_things_service/tests dump_things_service/backends/tests}' run = 'python -m pytest {args}'
[tool.ruff] [tool.ruff]
extend-exclude = [ extend-exclude = [