diff --git a/README.md b/README.md index 2debf48..dbac399 100644 --- a/README.md +++ b/README.md @@ -527,9 +527,13 @@ Here `` must be a path to a directory. If the directory does not exist, it will be created. If the directory exists, it should contain a bare git repository. -The commands `dump-things-report-gitaudit ` can be used to show the audit-log for the given `PID`. +The command `dump-things-gitaudit-report <audit_store> <PID>` can be used to show the audit-log for all PIDs that match the given `PID`-pattern (patterns use Python `re`-module syntax, e.g. use `'.*'` to report changes for all PIDs). Each log entry contains the timestamp of the change, the ID of the curator that posted the change, a diff of the change, and the resulting record. +The command `dump-things-gitaudit-rebuild-index <audit_store>` can be used to rebuild an index for a git-audit backend. +Executing this command should not be necessary in normal operations because the backend will rebuild an index if it is instantiated on a directory that has no index. +The command mainly exists for maintenance purposes. + Note: currently the user ID of the curator will be stored as author in the audit-log entries. The "original" author of a change is usually identified in the `annotations`-field of the record. @@ -673,7 +677,7 @@ The service provides the following user endpoints (In addition to user endpoints The service supports a set of curation endpoints that allows direct access to the curated area as well as the incoming areas. A `CURATOR`-token required to access these endpoints. -Details about the curation endpoints can be found in [this issue](https://github.com/christian-monch/dump-things-server/issues/118). +Details about the curation endpoints can be found in [this issue](https://codeberg.org/datalink/dump-things-server/issues/118). 
### Tips & Tricks diff --git a/dump_things_service/audit/__init__.py b/dump_things_service/audit/__init__.py index 3bfef2b..c7520ce 100644 --- a/dump_things_service/audit/__init__.py +++ b/dump_things_service/audit/__init__.py @@ -8,15 +8,18 @@ class AuditBackend(metaclass=ABCMeta): @abstractmethod def add_record( self, - record_id: str, record: dict, - user_id: str, + committer_id: str, + author_id: str | None = None, ) -> None: """Add information about a new record version to the audit log - :param record_id: the ID of the record (this is usually `record['pid']`. - :param record: the content of the new record (will be stored in YAML format). - :param user_id: the ID of the user who adds the record. + :param record: the content of the new record. The record must contain + a `pid`-key which is associated with the ID of the record (the + record will be stored in YAML format). + :param committer_id: the ID of the user who adds the record. + :param author_id: the ID of the user who modified the record, defaults + to `committer_id` if not given. :return: A dictionary where the keys are time stamps of the changes, the values are tuples containing the elements: (user_id, diff, resulting_record), where user_id is the diff --git a/dump_things_service/audit/gitaudit.py b/dump_things_service/audit/gitaudit.py index 8d4ce51..5feeefe 100644 --- a/dump_things_service/audit/gitaudit.py +++ b/dump_things_service/audit/gitaudit.py @@ -7,6 +7,8 @@ committed. 
Changes are annotated with a time stamp and a user-id """ import hashlib +import re +import string from datetime import datetime from pathlib import Path @@ -28,24 +30,33 @@ class GitAuditBackend(AuditBackend): path: Path, ): self.path = path - self.cache = {} + self.index_path = None + self.cached_index_entries = [] self.current_change_set = {} - self.repo = self._init_repo() + self._init_repo() def add_record( self, - record_id: str, record: dict, - user_id: str, + committer_id: str, + author_id: str | None = None, ) -> None: + author_id = author_id or committer_id  # None or '' means: author is the committer + committer_id = self._escape_person_id(committer_id) + author_id = self._escape_person_id(author_id) + record_id = record['pid'] location = self._get_location_for(record_id) if self._has_pending_changes(location): self._persist_pending_changes() - self._add_elements(location, user_id, record) + self._add_elements(record_id, location, committer_id, author_id, record) def flush(self): if self.current_change_set: self._persist_pending_changes() + if self.cached_index_entries: + with self.index_path.open('at') as f: + f.write('\n'.join(self.cached_index_entries) + '\n') + self.cached_index_entries = [] def get_audit_log( self, @@ -75,7 +86,7 @@ class GitAuditBackend(AuditBackend): log_diff_lines, ) )[0][1:] - time_stamp, user_id = log_entry.split(' ', 1) + time_stamp, committer_id, author_id = log_entry.split(' ') # Get the YAML diff yaml_diff_lines = call_git( @@ -91,15 +102,36 @@ class GitAuditBackend(AuditBackend): cwd=self.path, capture_output=True, ).decode() - changes.append((time_stamp, user_id, yaml_diff, yaml_content)) + changes.append( + (time_stamp, committer_id, author_id, yaml_diff, yaml_content) + ) changes.sort() return {c[0]: c[1:] for c in changes} + def get_audit_logs( + self, + record_id_pattern: str, + ) -> dict: + self.flush() + matcher = re.compile(record_id_pattern) + matching_ids = tuple( + filter( + lambda record_id: matcher.fullmatch(record_id) is not None, 
self.index, + ) + ) + return { + record_id: self.get_audit_log(record_id) + for record_id in sorted(matching_ids) + } + def _add_elements( self, + record_id: str, location: tuple[str, Path, Path], - user_id: str, + committer_id: str, + author_id: str, record: dict, ) -> bool: existing_record = self._read_record_from_repo_path(location[1]) @@ -110,20 +142,30 @@ class GitAuditBackend(AuditBackend): allow_unicode=True, default_flow_style=False, ) - self._add_log_entry(location[2], user_id) + self._add_log_entry(location[2], committer_id, author_id) + self._add_index_entry(record_id) return True return False def _add_log_entry( self, log_location: Path, - user_id: str, + committer_id: str, + author_id: str, ) -> None: time_stamp = datetime.now().isoformat() log_content = self._read_from_repo_path(log_location).decode() - log_content += f'{time_stamp} {user_id}\n' + log_content += f'{time_stamp} {committer_id} {author_id}\n' self.current_change_set[log_location] = log_content + def _add_index_entry( + self, + record_id: str, + ): + if record_id not in self.index: + self.cached_index_entries.append(record_id) + self.index.add(record_id) + def _read_from_repo_path( self, path: Path, @@ -181,19 +223,77 @@ class GitAuditBackend(AuditBackend): location_dir / (base + '.log'), ) - def _init_repo(self) -> Repo: + def _init_repo(self) -> None: if self.path.exists(): is_empty = len(tuple(Path(self.path).glob('**'))) == 1 else: self.path.mkdir(parents=True) is_empty = True - if is_empty: - call_git(['init', '--bare', str(self.path)], cwd=self.path) - self.repo = Repo(self.path) + self.index_path = self.path / 'index.log' - apply_changeset( - self.repo, - {'README.txt': 'A git-based audit backend\n'}, - message='add README.txt', - ) - return self.repo + if is_empty: + call_git(['init', '--bare', str(self.path)], capture_output=True) + self.repo = Repo(self.path) + apply_changeset( + self.repo, + {'README.txt': 'A git-based audit backend\n'}, + message='add README.txt', + ) + 
self.index_path.write_text('') + else: + self.repo = Repo(self.path) + + if not self.index_path.exists(): + self._rebuild_index() + + with open(self.index_path, 'rt') as f: + self.index = set(line.strip() for line in f.readlines()) + + def _add_to_index( + self, + record_id: str, + ): + if record_id not in self.index: + self.cached_index_entries.append(record_id) + self.index.add(record_id) + + def _rebuild_index(self): + print('rebuilding index') + tree_entries = call_git( + ['ls-tree', '-r', 'master:'], + cwd=self.path, + capture_output=True, + ).decode().splitlines() + with open(self.index_path, 'wt') as f: + for line in tree_entries: + if not line.endswith('.yaml'): + print('ignoring line:', repr(line)) + continue + print('got line:', repr(line)) + flag, object_type, object_hash, file_name = line.split() + record = yaml.safe_load( + call_git( + ['show', object_hash], + cwd=self.path, + capture_output=True, + ).decode() + ) + print('got record:', repr(record)) + f.write(record['pid'] + '\n') + + def _escape_person_id( + self, + person_id: str, + ): + if not person_id: + msg = f'empty ID string not allowed: {person_id}' + raise ValueError(msg) + if any( + map( + lambda character: character in person_id, + string.whitespace + ) + ): + msg = f'ID string must not contain whitespace: {person_id}' + raise ValueError(msg) + return person_id diff --git a/dump_things_service/audit/tests/test_gitaudit.py b/dump_things_service/audit/tests/test_gitaudit.py index 1620355..ef1e059 100644 --- a/dump_things_service/audit/tests/test_gitaudit.py +++ b/dump_things_service/audit/tests/test_gitaudit.py @@ -27,9 +27,9 @@ def test_gitaudit_basic(tmp_path_factory): for index in range(4): backend.add_record( - record_id=record_id, record={'pid': record_id, 'content': index}, - user_id=f'tester_{index}@example.com', + committer_id=f'committer_{100 + index}@x.org', + author_id=f'author_{index}@y.org', ) # Check that the log file has 4 entries @@ -44,7 +44,10 @@ def 
test_gitaudit_basic(tmp_path_factory): # Check that the changes are reported changes = backend.get_audit_log(record_id) assert len(changes) == 4 - assert tuple(map(lambda e: e[0], changes.values())) == tuple((f'tester_{i}@example.com' for i in range(4))) + assert tuple(map(lambda e: e[0:2], changes.values())) == tuple( + (f'committer_{100 + i}@x.org', f'author_{i}@y.org') + for i in range(4) + ) def test_gitaudit_identical_change(tmp_path_factory): @@ -54,15 +57,15 @@ def test_gitaudit_identical_change(tmp_path_factory): record_id = 'test_gitaudit_idempotent' backend.add_record( - record_id=record_id, record={'pid': record_id}, - user_id='tester@example.com', + committer_id='committer_b@x.org', + author_id = 'author_b@y.org', ) backend.add_record( - record_id=record_id, record={'pid': record_id}, - user_id='tester@example.com', + committer_id='committer_b@x.org', + author_id = 'author_b@y.org', ) # Check that there is only one entry in the audit log @@ -91,9 +94,9 @@ def test_gitaudit_huge_log(tmp_path_factory): for i in range(record_number): record_id = f'huge_{i}' backend.add_record( - record_id=record_id, record={'pid': record_id, 'content': f'j:{j}, i:{i}'}, - user_id='tester@example.com', + committer_id='committer@x.org', + author_id = 'author@y.org', ) # Check that the changes are reported diff --git a/dump_things_service/commands/gitaudit_rebuild_index.py b/dump_things_service/commands/gitaudit_rebuild_index.py new file mode 100644 index 0000000..bde297e --- /dev/null +++ b/dump_things_service/commands/gitaudit_rebuild_index.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import sys +from argparse import ArgumentParser +from pathlib import Path + +from dump_things_service.audit.gitaudit import GitAuditBackend + + +parser = ArgumentParser( + prog='Rebuild the index of a `gitaudit`-database', + description='This command rebuilds the index of a `gitaudit`-database.' 
+) +parser.add_argument( + 'audit_store', + help='The directory in which the `gitaudit`-database is located.' +) + + +def main(): + arguments = parser.parse_args() + + audit_backend = GitAuditBackend(Path(arguments.audit_store)) + audit_backend._rebuild_index() + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/dump_things_service/commands/gitaudit_report.py b/dump_things_service/commands/gitaudit_report.py new file mode 100644 index 0000000..06600b8 --- /dev/null +++ b/dump_things_service/commands/gitaudit_report.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +import json +import re +import sys +from argparse import ArgumentParser +from pathlib import Path + +from dump_things_service.audit.gitaudit import GitAuditBackend + + +parser = ArgumentParser( + prog='Report audit information for a PID', + description='Report the audit information that was stored for a specific ' + 'PID. For every change to a record the tool will report: ' + 'time stamp, user ID, diff, and the resulting record.', +) +parser.add_argument( + 'audit_store', + help='The path to the gitaudit store', +) +parser.add_argument( + 'pid', + help='Regex pattern that identifies PIDs of the record for which audit ' + 'information should be reported ' + '(to see all audit log entries, specify ".*").', +) + + +def main(): + arguments = parser.parse_args() + + try: + re.compile(arguments.pid) + except re.error as e: + print('Error in PID pattern:', e, file=sys.stderr, flush=True) + return 1 + + audit_backend = GitAuditBackend(Path(arguments.audit_store)) + all_changes = audit_backend.get_audit_logs(arguments.pid) + for record_id, report_changes in all_changes.items(): + for time_stamp, change in report_changes.items(): + report = { + 'time-stamp': time_stamp, + 'record-id': record_id, + 'committer-id': change[0], + 'author-id': change[1], + 'diff': change[2], + 'resulting-record': change[3], + } + print(json.dumps(report, ensure_ascii=False), flush=True) + return 0 + + +if 
__name__ == '__main__': + sys.exit(main()) diff --git a/dump_things_service/commands/report_gitaudit.py b/dump_things_service/commands/report_gitaudit.py deleted file mode 100644 index 5ec037a..0000000 --- a/dump_things_service/commands/report_gitaudit.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -import json -import sys -from argparse import ArgumentParser -from pathlib import Path - -from dump_things_service.audit.gitaudit import GitAuditBackend - - -parser = ArgumentParser( - prog='Report audit information for a PID', - description='Report the audit information that was stored for a specific ' - 'PID. For every change to a record the tool will report: ' - 'time stamp, user ID, diff, and the resulting record.', -) -parser.add_argument( - 'audit_store', - help='The path to the gitaudit store', -) -parser.add_argument( - 'pid', - help='The PID of the record for which audit information should be reported.', -) - - -def main(): - arguments = parser.parse_args() - - audit_backend = GitAuditBackend(Path(arguments.audit_store)) - changes = audit_backend.get_audit_log(arguments.pid) - - output = { - time_stamp: { - 'user-id': change[0], - 'diff': change[1], - 'resulting-record': change[2], - } - for time_stamp, change in changes.items() - } - - print(json.dumps(output, indent=2, ensure_ascii=False)) - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index 6f12927..7366343 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -44,17 +44,20 @@ if TYPE_CHECKING: _endpoint_curated_template = """ async def {name}( data: {model_var_name}.{class_name}, + author_id: str | None = None, api_key: str = Depends(api_key_header_scheme), ) -> JSONResponse: logger.info( - '{name}(%s, %s)', + '{name}(%s, %s, %s)', repr(data), + repr(author_id), repr({model_var_name}), ) return await store_curated_record( '{collection}', data, '{class_name}', + 
author_id, api_key, ) """ @@ -353,6 +356,7 @@ async def store_curated_record( collection: str, data: BaseModel, class_name: str, + author_id: str | None = None, api_key: str | None = Depends(api_key_header_scheme), ): @@ -377,7 +381,7 @@ async def store_curated_record( for audit_backend in instance_config.audit_backends[collection]: audit_backend.add_record( - record_id=pid, record=json_object, - user_id=instance_config.tokens[collection][api_key]['user_id'], + committer_id=instance_config.tokens[collection][api_key]['user_id'], + author_id=author_id, ) diff --git a/dump_things_service/tests/test_curated.py b/dump_things_service/tests/test_curated.py index 3af68d7..e6a99a3 100644 --- a/dump_things_service/tests/test_curated.py +++ b/dump_things_service/tests/test_curated.py @@ -131,7 +131,7 @@ def test_audit_backend(fastapi_client_simple): for i in range(2): response = test_client.post( - '/collection_1/curated/record/Person', + f'/collection_1/curated/record/Person?author_id=author_{i}@www.org', headers={'x-dumpthings-token': tokens[i]}, json=json_objects[i], ) @@ -145,4 +145,5 @@ def test_audit_backend(fastapi_client_simple): values = tuple(changes.values()) for i in range(2): assert values[i][0] == user_names[i] - assert yaml.safe_load(values[i][2]) == json_objects[i] + assert values[i][1] == f'author_{i}@www.org' + assert yaml.safe_load(values[i][3]) == json_objects[i] diff --git a/pyproject.toml b/pyproject.toml index 9104d5f..af15cfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,8 @@ dump-things-rebuild-index = "dump_things_service.commands.rebuild_index:main" dump-things-copy-store = "dump_things_service.commands.copy_store:main" dump-things-pid-check = "dump_things_service.commands.check_pids:main" dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main" -dump-things-report-gitaudit = "dump_things_service.commands.report_gitaudit:main" +dump-things-gitaudit-report = 
"dump_things_service.commands.gitaudit_report:main" +dump-things-gitaudit-rebuild-index = "dump_things_service.commands.gitaudit_rebuild_index:main" [tool.hatch.build.targets.wheel] exclude = [ @@ -118,7 +119,7 @@ extra-dependencies = [ ] [tool.hatch.envs.tests.scripts] -run = 'python -m pytest {args:dump_things_service/tests dump_things_service/backends/tests}' +run = 'python -m pytest {args}' [tool.ruff] extend-exclude = [