Add PID matching based on regex-patterns for gitaudit-backend reports. #199
10 changed files with 244 additions and 90 deletions
|
|
@ -527,9 +527,13 @@ Here `<path to directory>` must be a path to a directory.
|
|||
If the directory does not exist, it will be created.
|
||||
If the directory exists, it should contain a bare git repository.
|
||||
|
||||
The commands `dump-things-report-gitaudit <path to directory> <PID>` can be used to show the audit-log for the given `PID`.
|
||||
The command `dump-things-gitaudit-report <path to directory> <PID-pattern>` can be used to show the audit-log for all PIDs that match the given `PID`-pattern (patterns are in Python `re`-module syntax, i.e. use `'.*'` to report changes for all PIDs).
|
||||
Each log entry contains the timestamp of the change, the ID of the curator that posted the change, a diff of the change, and the resulting record.
|
||||
|
||||
The command `dump-things-gitaudit-rebuild-index <path to directory>` can be used to rebuild an index for a git-audit backend.
|
||||
Executing this command should not be necessary in normal operations because the backend will rebuild an index if it is instantiated on a directory that has no index.
|
||||
The command mainly exists for maintenance purposes.
|
||||
|
||||
Note: currently the user ID of the curator will be stored as author in the audit-log entries.
|
||||
The "original" author of a change is usually identified in the `annotations`-field of the record.
|
||||
|
||||
|
|
@ -673,7 +677,7 @@ The service provides the following user endpoints (In addition to user endpoints
|
|||
|
||||
The service supports a set of curation endpoints that allows direct access to the curated area as well as the incoming areas.
|
||||
A `CURATOR`-token is required to access these endpoints.
|
||||
Details about the curation endpoints can be found in [this issue](https://github.com/christian-monch/dump-things-server/issues/118).
|
||||
Details about the curation endpoints can be found in [this issue](https://codeberg.org/datalink/dump-things-server/issues/118).
|
||||
|
||||
|
||||
### Tips & Tricks
|
||||
|
|
|
|||
|
|
@ -8,15 +8,18 @@ class AuditBackend(metaclass=ABCMeta):
|
|||
@abstractmethod
|
||||
def add_record(
|
||||
self,
|
||||
record_id: str,
|
||||
record: dict,
|
||||
user_id: str,
|
||||
committer_id: str,
|
||||
author_id: str | None = None,
|
||||
) -> None:
|
||||
"""Add information about a new record version to the audit log
|
||||
|
||||
:param record_id: the ID of the record (this is usually `record['pid']`).
|
||||
:param record: the content of the new record (will be stored in YAML format).
|
||||
:param user_id: the ID of the user who adds the record.
|
||||
:param record: the content of the new record. The record must contain
|
||||
a `pid`-key which is associated with the ID of the record (the
|
||||
record will be stored in YAML format).
|
||||
:param committer_id: the ID of the user who adds the record.
|
||||
:param author_id: the ID of the user who modified the record, defaults
|
||||
to `committer_id` if not given.
|
||||
:return: A dictionary where the keys are time stamps of the changes,
|
||||
the values are tuples containing the elements:
|
||||
(user_id, diff, resulting_record), where user_id is the
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@ committed.
|
|||
Changes are annotated with a time stamp and a user-id
|
||||
"""
|
||||
import hashlib
|
||||
import re
|
||||
import string
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -28,24 +30,33 @@ class GitAuditBackend(AuditBackend):
|
|||
path: Path,
|
||||
):
|
||||
self.path = path
|
||||
self.cache = {}
|
||||
self.index_path = None
|
||||
self.cached_index_entries = []
|
||||
self.current_change_set = {}
|
||||
self.repo = self._init_repo()
|
||||
self._init_repo()
|
||||
|
||||
def add_record(
|
||||
self,
|
||||
record_id: str,
|
||||
record: dict,
|
||||
user_id: str,
|
||||
committer_id: str,
|
||||
author_id: str = '',
|
||||
) -> None:
|
||||
author_id = committer_id if author_id == '' else author_id
|
||||
committer_id = self._escape_person_id(committer_id)
|
||||
author_id = self._escape_person_id(author_id)
|
||||
record_id = record['pid']
|
||||
location = self._get_location_for(record_id)
|
||||
if self._has_pending_changes(location):
|
||||
self._persist_pending_changes()
|
||||
self._add_elements(location, user_id, record)
|
||||
self._add_elements(record_id, location, committer_id, author_id, record)
|
||||
|
||||
def flush(self):
    """Persist pending repository changes and buffered index entries.

    Pending record/log changes are committed via
    `_persist_pending_changes`; new PIDs collected in
    `cached_index_entries` are appended to the on-disk index file.
    """
    if self.current_change_set:
        self._persist_pending_changes()
    if not self.cached_index_entries:
        return
    # Append all buffered PIDs in one write, one PID per line.
    with self.index_path.open('at') as f:
        f.write('\n'.join(self.cached_index_entries) + '\n')
    self.cached_index_entries = []
|
||||
|
||||
def get_audit_log(
|
||||
self,
|
||||
|
|
@ -75,7 +86,7 @@ class GitAuditBackend(AuditBackend):
|
|||
log_diff_lines,
|
||||
)
|
||||
)[0][1:]
|
||||
time_stamp, user_id = log_entry.split(' ', 1)
|
||||
time_stamp, committer_id, author_id = log_entry.split(' ')
|
||||
|
||||
# Get the YAML diff
|
||||
yaml_diff_lines = call_git(
|
||||
|
|
@ -91,15 +102,36 @@ class GitAuditBackend(AuditBackend):
|
|||
cwd=self.path,
|
||||
capture_output=True,
|
||||
).decode()
|
||||
changes.append((time_stamp, user_id, yaml_diff, yaml_content))
|
||||
changes.append(
|
||||
(time_stamp, committer_id, author_id, yaml_diff, yaml_content)
|
||||
)
|
||||
|
||||
changes.sort()
|
||||
return {c[0]: c[1:] for c in changes}
|
||||
|
||||
def get_audit_logs(
    self,
    record_id_pattern: str,
) -> dict:
    """Return the audit logs of all records whose ID matches a pattern.

    :param record_id_pattern: a regular expression in Python `re` syntax;
        it is matched with `fullmatch` against every known record ID.
    :return: a dictionary mapping each matching record ID to its audit
        log, with record IDs processed in sorted order.
    """
    # Make sure all pending changes are visible before reporting.
    self.flush()
    matcher = re.compile(record_id_pattern)
    matching_ids = sorted(
        record_id
        for record_id in self.index
        if matcher.fullmatch(record_id)
    )
    return {
        record_id: self.get_audit_log(record_id)
        for record_id in matching_ids
    }
|
||||
|
||||
def _add_elements(
|
||||
self,
|
||||
record_id: str,
|
||||
location: tuple[str, Path, Path],
|
||||
user_id: str,
|
||||
committer_id: str,
|
||||
author_id: str,
|
||||
record: dict,
|
||||
) -> bool:
|
||||
existing_record = self._read_record_from_repo_path(location[1])
|
||||
|
|
@ -110,20 +142,30 @@ class GitAuditBackend(AuditBackend):
|
|||
allow_unicode=True,
|
||||
default_flow_style=False,
|
||||
)
|
||||
self._add_log_entry(location[2], user_id)
|
||||
self._add_log_entry(location[2], committer_id, author_id)
|
||||
self._add_index_entry(record_id)
|
||||
return True
|
||||
return False
|
||||
|
||||
def _add_log_entry(
|
||||
self,
|
||||
log_location: Path,
|
||||
user_id: str,
|
||||
committer_id: str,
|
||||
author_id: str,
|
||||
) -> None:
|
||||
time_stamp = datetime.now().isoformat()
|
||||
log_content = self._read_from_repo_path(log_location).decode()
|
||||
log_content += f'{time_stamp} {user_id}\n'
|
||||
log_content += f'{time_stamp} {committer_id} {author_id}\n'
|
||||
self.current_change_set[log_location] = log_content
|
||||
|
||||
def _add_index_entry(
|
||||
self,
|
||||
record_id: str,
|
||||
):
|
||||
if record_id not in self.index:
|
||||
self.cached_index_entries.append(record_id)
|
||||
self.index.add(record_id)
|
||||
|
||||
def _read_from_repo_path(
|
||||
self,
|
||||
path: Path,
|
||||
|
|
@ -181,19 +223,77 @@ class GitAuditBackend(AuditBackend):
|
|||
location_dir / (base + '.log'),
|
||||
)
|
||||
|
||||
def _init_repo(self) -> None:
    """Open (and, if necessary, create) the bare audit repository.

    Creates `self.path` and initializes a bare git repository in it when
    the directory is missing or empty, seeds a new repository with a
    README, ensures an index file exists (rebuilding it from the
    repository content if missing), and loads it into `self.index`.
    """
    if self.path.exists():
        # A directory that contains only itself in `glob('**')` is empty.
        is_empty = len(tuple(Path(self.path).glob('**'))) == 1
    else:
        self.path.mkdir(parents=True)
        is_empty = True

    self.index_path = self.path / 'index.log'
    if is_empty:
        call_git(['init', '--bare', str(self.path)], capture_output=True)
        self.repo = Repo(self.path)
        apply_changeset(
            self.repo,
            {'README.txt': 'A git-based audit backend\n'},
            message='add README.txt',
        )
        # A brand-new repository starts with an empty index.
        self.index_path.write_text('')
    else:
        self.repo = Repo(self.path)

    # Repositories created before indexing existed have no index file;
    # rebuild it from the repository content.
    if not self.index_path.exists():
        self._rebuild_index()

    with open(self.index_path, 'rt') as f:
        self.index = set(line.strip() for line in f)
|
||||
|
||||
def _add_to_index(
|
||||
self,
|
||||
record_id: str,
|
||||
):
|
||||
if record_id not in self.index:
|
||||
self.cached_index_entries.append(record_id)
|
||||
self.index.add(record_id)
|
||||
|
||||
def _rebuild_index(self):
    """Recreate the PID index from every YAML record in the repository.

    Lists all blobs reachable from `master`, loads each `*.yaml` record,
    and writes the record's `pid` to the index file, one PID per line.
    Existing index content is overwritten.
    """
    tree_entries = call_git(
        ['ls-tree', '-r', 'master:'],
        cwd=self.path,
        capture_output=True,
    ).decode().splitlines()
    with open(self.index_path, 'wt') as f:
        for line in tree_entries:
            # Only YAML files carry records; skip log files etc.
            if not line.endswith('.yaml'):
                continue
            # ls-tree output: '<mode> <type> <hash>\t<name>'; limit the
            # split so file names containing whitespace do not break it.
            _mode, _object_type, object_hash, _file_name = line.split(None, 3)
            record = yaml.safe_load(
                call_git(
                    ['show', object_hash],
                    cwd=self.path,
                    capture_output=True,
                ).decode()
            )
            f.write(record['pid'] + '\n')
|
||||
|
||||
def _escape_person_id(
|
||||
self,
|
||||
person_id: str,
|
||||
):
|
||||
if not person_id:
|
||||
msg = f'empty ID string not allowed: {person_id}'
|
||||
raise ValueError(msg)
|
||||
if any(
|
||||
map(
|
||||
lambda character: character in person_id,
|
||||
string.whitespace
|
||||
)
|
||||
):
|
||||
msg = f'ID string must not contain whitespace: {person_id}'
|
||||
raise ValueError(msg)
|
||||
return person_id
|
||||
|
|
|
|||
|
|
@ -27,9 +27,9 @@ def test_gitaudit_basic(tmp_path_factory):
|
|||
|
||||
for index in range(4):
|
||||
backend.add_record(
|
||||
record_id=record_id,
|
||||
record={'pid': record_id, 'content': index},
|
||||
user_id=f'tester_{index}@example.com',
|
||||
committer_id=f'committer_{100 + index}@x.org',
|
||||
author_id=f'author_{index}@y.org',
|
||||
)
|
||||
|
||||
# Check that the log file has 4 entries
|
||||
|
|
@ -44,7 +44,10 @@ def test_gitaudit_basic(tmp_path_factory):
|
|||
# Check that the changes are reported
|
||||
changes = backend.get_audit_log(record_id)
|
||||
assert len(changes) == 4
|
||||
assert tuple(map(lambda e: e[0], changes.values())) == tuple((f'tester_{i}@example.com' for i in range(4)))
|
||||
assert tuple(map(lambda e: e[0:2], changes.values())) == tuple(
|
||||
(f'committer_{100 + i}@x.org', f'author_{i}@y.org')
|
||||
for i in range(4)
|
||||
)
|
||||
|
||||
|
||||
def test_gitaudit_identical_change(tmp_path_factory):
|
||||
|
|
@ -54,15 +57,15 @@ def test_gitaudit_identical_change(tmp_path_factory):
|
|||
|
||||
record_id = 'test_gitaudit_idempotent'
|
||||
backend.add_record(
|
||||
record_id=record_id,
|
||||
record={'pid': record_id},
|
||||
user_id='tester@example.com',
|
||||
committer_id='committer_b@x.org',
|
||||
author_id = 'author_b@y.org',
|
||||
)
|
||||
|
||||
backend.add_record(
|
||||
record_id=record_id,
|
||||
record={'pid': record_id},
|
||||
user_id='tester@example.com',
|
||||
committer_id='committer_b@x.org',
|
||||
author_id = 'author_b@y.org',
|
||||
)
|
||||
|
||||
# Check that there is only one entry in the audit log
|
||||
|
|
@ -91,9 +94,9 @@ def test_gitaudit_huge_log(tmp_path_factory):
|
|||
for i in range(record_number):
|
||||
record_id = f'huge_{i}'
|
||||
backend.add_record(
|
||||
record_id=record_id,
|
||||
record={'pid': record_id, 'content': f'j:{j}, i:{i}'},
|
||||
user_id='tester@example.com',
|
||||
committer_id='committer@x.org',
|
||||
author_id = 'author@y.org',
|
||||
)
|
||||
|
||||
# Check that the changes are reported
|
||||
|
|
|
|||
29
dump_things_service/commands/gitaudit_rebuild_index.py
Normal file
29
dump_things_service/commands/gitaudit_rebuild_index.py
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
"""Command-line entry point to rebuild the index of a `gitaudit` store."""
from __future__ import annotations

import sys
from argparse import ArgumentParser
from pathlib import Path

from dump_things_service.audit.gitaudit import GitAuditBackend


# NOTE(review): `prog` should normally carry the executable name
# (e.g. 'dump-things-gitaudit-rebuild-index'); the descriptive text is
# kept as-is so the help output does not change.
parser = ArgumentParser(
    prog='Rebuild the index of a `gitaudit`-database',
    description='This command rebuilds the index of a `gitaudit`-database.'
)
parser.add_argument(
    'audit_store',
    help='The directory in which the `gitaudit`-database is located.'
)


def main():
    """Parse arguments and force an index rebuild for the given store."""
    arguments = parser.parse_args()

    audit_backend = GitAuditBackend(Path(arguments.audit_store))
    # NOTE(review): reaches into a private method of the backend; a public
    # `rebuild_index()` API would be cleaner.
    audit_backend._rebuild_index()
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
56
dump_things_service/commands/gitaudit_report.py
Normal file
56
dump_things_service/commands/gitaudit_report.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
"""Command-line entry point that reports gitaudit changes for matching PIDs."""
from __future__ import annotations

import json
import re
import sys
from argparse import ArgumentParser
from pathlib import Path

from dump_things_service.audit.gitaudit import GitAuditBackend


parser = ArgumentParser(
    prog='Report audit information for a PID',
    description='Report the audit information that was stored for a specific '
                'PID. For every change to a record the tool will report: '
                'time stamp, user ID, diff, and the resulting record.',
)
parser.add_argument(
    'audit_store',
    help='The path to the gitaudit store',
)
parser.add_argument(
    'pid',
    help='Regex pattern that identifies PIDs of the record for which audit '
         'information should be reported '
         '(to see all audit log entries, specify ".*").',
)


def main():
    """Validate the PID pattern and print one JSON report per change."""
    arguments = parser.parse_args()

    # Fail early with a readable message on an invalid regex; the compiled
    # pattern itself is not needed, the backend matches by pattern string.
    try:
        re.compile(arguments.pid)
    except re.error as e:
        print('Error in PID pattern:', e, file=sys.stderr, flush=True)
        return 1

    audit_backend = GitAuditBackend(Path(arguments.audit_store))
    all_changes = audit_backend.get_audit_logs(arguments.pid)
    for record_id, report_changes in all_changes.items():
        for time_stamp, change in report_changes.items():
            committer_id, author_id, diff, resulting_record = change
            report = {
                'time-stamp': time_stamp,
                'record-id': record_id,
                'committer-id': committer_id,
                'author-id': author_id,
                'diff': diff,
                'resulting-record': resulting_record,
            }
            # One JSON object per line (JSON-lines style output).
            print(json.dumps(report, ensure_ascii=False), flush=True)
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from argparse import ArgumentParser
|
||||
from pathlib import Path
|
||||
|
||||
from dump_things_service.audit.gitaudit import GitAuditBackend
|
||||
|
||||
|
||||
parser = ArgumentParser(
|
||||
prog='Report audit information for a PID',
|
||||
description='Report the audit information that was stored for a specific '
|
||||
'PID. For every change to a record the tool will report: '
|
||||
'time stamp, user ID, diff, and the resulting record.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'audit_store',
|
||||
help='The path to the gitaudit store',
|
||||
)
|
||||
parser.add_argument(
|
||||
'pid',
|
||||
help='The PID of the record for which audit information should be reported.',
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
arguments = parser.parse_args()
|
||||
|
||||
audit_backend = GitAuditBackend(Path(arguments.audit_store))
|
||||
changes = audit_backend.get_audit_log(arguments.pid)
|
||||
|
||||
output = {
|
||||
time_stamp: {
|
||||
'user-id': change[0],
|
||||
'diff': change[1],
|
||||
'resulting-record': change[2],
|
||||
}
|
||||
for time_stamp, change in changes.items()
|
||||
}
|
||||
|
||||
print(json.dumps(output, indent=2, ensure_ascii=False))
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
|
@ -44,17 +44,20 @@ if TYPE_CHECKING:
|
|||
_endpoint_curated_template = """
|
||||
async def {name}(
|
||||
data: {model_var_name}.{class_name},
|
||||
author_id: str | None = None,
|
||||
api_key: str = Depends(api_key_header_scheme),
|
||||
) -> JSONResponse:
|
||||
logger.info(
|
||||
'{name}(%s, %s)',
|
||||
'{name}(%s, %s, %s)',
|
||||
repr(data),
|
||||
repr(author_id),
|
||||
repr({model_var_name}),
|
||||
)
|
||||
return await store_curated_record(
|
||||
'{collection}',
|
||||
data,
|
||||
'{class_name}',
|
||||
author_id,
|
||||
api_key,
|
||||
)
|
||||
"""
|
||||
|
|
@ -353,6 +356,7 @@ async def store_curated_record(
|
|||
collection: str,
|
||||
data: BaseModel,
|
||||
class_name: str,
|
||||
author_id: str | None = None,
|
||||
api_key: str | None = Depends(api_key_header_scheme),
|
||||
):
|
||||
|
||||
|
|
@ -377,7 +381,7 @@ async def store_curated_record(
|
|||
|
||||
for audit_backend in instance_config.audit_backends[collection]:
|
||||
audit_backend.add_record(
|
||||
record_id=pid,
|
||||
record=json_object,
|
||||
user_id=instance_config.tokens[collection][api_key]['user_id'],
|
||||
committer_id=instance_config.tokens[collection][api_key]['user_id'],
|
||||
author_id=author_id,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -131,7 +131,7 @@ def test_audit_backend(fastapi_client_simple):
|
|||
|
||||
for i in range(2):
|
||||
response = test_client.post(
|
||||
'/collection_1/curated/record/Person',
|
||||
f'/collection_1/curated/record/Person?author_id=author_{i}@www.org',
|
||||
headers={'x-dumpthings-token': tokens[i]},
|
||||
json=json_objects[i],
|
||||
)
|
||||
|
|
@ -145,4 +145,5 @@ def test_audit_backend(fastapi_client_simple):
|
|||
values = tuple(changes.values())
|
||||
for i in range(2):
|
||||
assert values[i][0] == user_names[i]
|
||||
assert yaml.safe_load(values[i][2]) == json_objects[i]
|
||||
assert values[i][1] == f'author_{i}@www.org'
|
||||
assert yaml.safe_load(values[i][3]) == json_objects[i]
|
||||
|
|
|
|||
|
|
@ -55,7 +55,8 @@ dump-things-rebuild-index = "dump_things_service.commands.rebuild_index:main"
|
|||
dump-things-copy-store = "dump_things_service.commands.copy_store:main"
|
||||
dump-things-pid-check = "dump_things_service.commands.check_pids:main"
|
||||
dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main"
|
||||
dump-things-report-gitaudit = "dump_things_service.commands.report_gitaudit:main"
|
||||
dump-things-gitaudit-report = "dump_things_service.commands.gitaudit_report:main"
|
||||
dump-things-gitaudit-rebuild-index = "dump_things_service.commands.gitaudit_rebuild_index:main"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
exclude = [
|
||||
|
|
@ -118,7 +119,7 @@ extra-dependencies = [
|
|||
]
|
||||
|
||||
[tool.hatch.envs.tests.scripts]
|
||||
run = 'python -m pytest {args:dump_things_service/tests dump_things_service/backends/tests}'
|
||||
run = 'python -m pytest {args}'
|
||||
|
||||
[tool.ruff]
|
||||
extend-exclude = [
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue